%s:%d

From: Afif Elghraoui Date: Sun, 2 Jul 2017 07:50:19 +0000 (-0400) Subject: Imported Upstream version 0.11.2.2+ds X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2~12^2~16 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=1520aab08562a7f44fd4570ea351a3dbd5db5a35;p=python-pysam.git Imported Upstream version 0.11.2.2+ds --- diff --git a/.gitignore b/.gitignore index 598948d..0910be8 100644 --- a/.gitignore +++ b/.gitignore @@ -23,19 +23,7 @@ htslib/config.mk pysam/config.py # cython files -pysam/TabProxies.c -pysam/csamtools.c -pysam/ctabix.c -pysam/cvcf.c -pysam/chtslib.c -pysam/cutils.c -pysam/calignedsegment.c -pysam/calignmentfile.c -pysam/cbcf.c -pysam/cfaidx.c -pysam/chtslib.c -pysam/csamfile.c -pysam/ctabixproxies.c +pysam/libc*.c ###### Generic python ignores below ###### diff --git a/MANIFEST.in b/MANIFEST.in index be43691..3f2a9cb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -10,9 +10,9 @@ include KNOWN_BUGS include THANKS include cy_build.py include requirements.txt -include pysam/c*.pxd -include pysam/c*.pyx -include pysam/c*.c +include pysam/libc*.pxd +include pysam/libc*.pyx +include pysam/libc*.c include pysam/*.c include pysam/*.h include samtools/configure @@ -29,6 +29,7 @@ include htslib/htslib_vars.mk include htslib/configure include htslib/config.mk.in include htslib/config.h.in +include htslib/htslib.pc.in include htslib/htslib/*.h include htslib/cram/*.c include htslib/cram/*.h diff --git a/bcftools/HMM.c b/bcftools/HMM.c index 9196544..5795987 100644 --- a/bcftools/HMM.c +++ b/bcftools/HMM.c @@ -31,6 +31,17 @@ #include #include "HMM.h" +typedef struct +{ + int nstates; // number of hmm's states + int isite; // take snapshot at i-th position + uint32_t pos; // i-th site's position + double *vit_prob; // viterbi probabilities, NULL for uniform probs + double *fwd_prob; // transition probabilities + double *bwd_prob; // transition probabilities +} +snapshot_t; + struct _hmm_t { int nstates; // number of states @@ -50,7 +61,8 @@ struct _hmm_t set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities // at each site (one step of Viterbi algorithm) void *set_tprob_data; - double *init_probs; // Initial state probabilities, NULL for uniform probs + snapshot_t init; // Initial state probabilities. Set isite=1 when site should be used + snapshot_t *snapshot; }; uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; } @@ -78,28 +90,79 @@ static inline void multiply_matrix(int n, double *a, double *b, double *dst, dou memcpy(dst,out,sizeof(double)*n*n); } +void hmm_init_states(hmm_t *hmm, double *probs) +{ + hmm->init.isite = 0; + hmm->init.pos = 0; + if ( !hmm->init.vit_prob ) + hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates); + if ( !hmm->init.fwd_prob ) + hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates); + if ( !hmm->init.bwd_prob ) + hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates); + + int i; + if ( probs ) + { + memcpy(hmm->init.vit_prob,probs,sizeof(double)*hmm->nstates); + double sum = 0; + for (i=0; instates; i++) sum += hmm->init.vit_prob[i]; + for (i=0; instates; i++) hmm->init.vit_prob[i] /= sum; + } + else + for (i=0; instates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates; + + memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates); + memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates); +} hmm_t *hmm_init(int nstates, double *tprob, int ntprob) { hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t)); hmm->nstates = nstates; hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates); hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates); - hmm_set_tprob(hmm, tprob, ntprob); - + hmm_init_states(hmm, NULL); return hmm; } -void hmm_init_states(hmm_t *hmm, double *probs) +void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite) { - if ( !probs ) + snapshot_t *snapshot = (snapshot_t*) _snapshot; + if ( snapshot && snapshot->nstates!=hmm->nstates ) { - free(hmm->init_probs); - hmm->init_probs = NULL; + free(snapshot); + snapshot = NULL; } - - if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates); - memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates); + if ( !snapshot ) + { + // Allocate the snapshot as a single memory block so that it can be + // free()-ed by the user. So make sure the arrays are aligned.. + size_t str_size = sizeof(snapshot_t); + size_t dbl_size = sizeof(double); + size_t pad_size = (dbl_size - str_size % dbl_size) % dbl_size; + uint8_t *mem = (uint8_t*) malloc(str_size + pad_size + dbl_size*2*hmm->nstates); + snapshot = (snapshot_t*) mem; + snapshot->nstates = hmm->nstates; + snapshot->vit_prob = (double*) (mem + str_size + pad_size); + snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates; + } + snapshot->isite = isite; + hmm->snapshot = snapshot; + return snapshot; +} +void hmm_restore(hmm_t *hmm, void *_snapshot) +{ + snapshot_t *snapshot = (snapshot_t*) _snapshot; + if ( !snapshot ) + { + hmm->init.isite = 0; + return; + } + hmm->init.isite = 1; + hmm->init.pos = snapshot->pos; + memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates); + memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates); } void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob) @@ -154,23 +217,18 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates); } - // Init all states with equal likelihood int i,j, nstates = hmm->nstates; - if ( hmm->init_probs ) - for (i=0; ivprob[i] = hmm->init_probs[i]; - else - for (i=0; ivprob[i] = 1./nstates; + memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates); + uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0]; // Run Viterbi - uint32_t prev_pos = sites[0]; for (i=0; ivpath[i*nstates]; double *eprob = &eprobs[i*nstates]; int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1; - _set_tprob(hmm, pos_diff); if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob); prev_pos = sites[i]; @@ -191,6 +249,12 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) } for (j=0; jvprob_tmp[j] /= vnorm; double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp; + + if ( hmm->snapshot && i==hmm->snapshot->isite ) + { + hmm->snapshot->pos = sites[i]; + memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates); + } } // Find the most likely state @@ -224,19 +288,12 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) // Init all states with equal likelihood int i,j,k, nstates = hmm->nstates; - if ( hmm->init_probs ) - { - for (i=0; ifwd[i] = hmm->init_probs[i]; - for (i=0; ibwd[i] = hmm->init_probs[i]; - } - else - { - for (i=0; ifwd[i] = 1./hmm->nstates; - for (i=0; ibwd[i] = 1./hmm->nstates; - } + memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates); + memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates); + + uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0]; // Run fwd - uint32_t prev_pos = sites[0]; for (i=0; ifwd[i*nstates]; @@ -261,6 +318,13 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) for (j=0; jsnapshot ) + { + i = hmm->snapshot->isite; + hmm->snapshot->pos = sites[i]; + memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates); + } + // Run bwd double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp; prev_pos = sites[n-1]; @@ -296,7 +360,7 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) } } -void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) +double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) { // Init arrays when run for the first time if ( hmm->nfwd < n ) @@ -312,16 +376,9 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) // Init all states with equal likelihood int i,j,k, nstates = hmm->nstates; - if ( hmm->init_probs ) - { - for (i=0; ifwd[i] = hmm->init_probs[i]; - for (i=0; ibwd[i] = hmm->init_probs[i]; - } - else - { - for (i=0; ifwd[i] = 1./hmm->nstates; - for (i=0; ibwd[i] = 1./hmm->nstates; - } + memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates); + memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates); + uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0]; // New transition matrix: temporary values double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double)); @@ -329,7 +386,6 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) double *fwd_bwd = (double*) malloc(sizeof(double)*nstates); // Run fwd - uint32_t prev_pos = sites[0]; for (i=0; ifwd[i*nstates]; @@ -416,11 +472,14 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) free(tmp_gamma); free(tmp_xi); free(fwd_bwd); + return hmm->curr_tprob; } void hmm_destroy(hmm_t *hmm) { - free(hmm->init_probs); + free(hmm->init.vit_prob); + free(hmm->init.fwd_prob); + free(hmm->init.bwd_prob); free(hmm->vprob); free(hmm->vprob_tmp); free(hmm->vpath); diff --git a/bcftools/HMM.c.pysam.c b/bcftools/HMM.c.pysam.c index a3b91ff..513da35 100644 --- a/bcftools/HMM.c.pysam.c +++ b/bcftools/HMM.c.pysam.c @@ -33,6 +33,17 @@ #include #include "HMM.h" +typedef struct +{ + int nstates; // number of hmm's states + int isite; // take snapshot at i-th position + uint32_t pos; // i-th site's position + double *vit_prob; // viterbi probabilities, NULL for uniform probs + double *fwd_prob; // transition probabilities + double *bwd_prob; // transition probabilities +} +snapshot_t; + struct _hmm_t { int nstates; // number of states @@ -52,7 +63,8 @@ struct _hmm_t set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities // at each site (one step of Viterbi algorithm) void *set_tprob_data; - double *init_probs; // Initial state probabilities, NULL for uniform probs + snapshot_t init; // Initial state probabilities. Set isite=1 when site should be used + snapshot_t *snapshot; }; uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; } @@ -80,28 +92,79 @@ static inline void multiply_matrix(int n, double *a, double *b, double *dst, dou memcpy(dst,out,sizeof(double)*n*n); } +void hmm_init_states(hmm_t *hmm, double *probs) +{ + hmm->init.isite = 0; + hmm->init.pos = 0; + if ( !hmm->init.vit_prob ) + hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates); + if ( !hmm->init.fwd_prob ) + hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates); + if ( !hmm->init.bwd_prob ) + hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates); + + int i; + if ( probs ) + { + memcpy(hmm->init.vit_prob,probs,sizeof(double)*hmm->nstates); + double sum = 0; + for (i=0; instates; i++) sum += hmm->init.vit_prob[i]; + for (i=0; instates; i++) hmm->init.vit_prob[i] /= sum; + } + else + for (i=0; instates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates; + + memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates); + memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates); +} hmm_t *hmm_init(int nstates, double *tprob, int ntprob) { hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t)); hmm->nstates = nstates; hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates); hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates); - hmm_set_tprob(hmm, tprob, ntprob); - + hmm_init_states(hmm, NULL); return hmm; } -void hmm_init_states(hmm_t *hmm, double *probs) +void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite) { - if ( !probs ) + snapshot_t *snapshot = (snapshot_t*) _snapshot; + if ( snapshot && snapshot->nstates!=hmm->nstates ) { - free(hmm->init_probs); - hmm->init_probs = NULL; + free(snapshot); + snapshot = NULL; } - - if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates); - memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates); + if ( !snapshot ) + { + // Allocate the snapshot as a single memory block so that it can be + // free()-ed by the user. So make sure the arrays are aligned.. + size_t str_size = sizeof(snapshot_t); + size_t dbl_size = sizeof(double); + size_t pad_size = (dbl_size - str_size % dbl_size) % dbl_size; + uint8_t *mem = (uint8_t*) malloc(str_size + pad_size + dbl_size*2*hmm->nstates); + snapshot = (snapshot_t*) mem; + snapshot->nstates = hmm->nstates; + snapshot->vit_prob = (double*) (mem + str_size + pad_size); + snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates; + } + snapshot->isite = isite; + hmm->snapshot = snapshot; + return snapshot; +} +void hmm_restore(hmm_t *hmm, void *_snapshot) +{ + snapshot_t *snapshot = (snapshot_t*) _snapshot; + if ( !snapshot ) + { + hmm->init.isite = 0; + return; + } + hmm->init.isite = 1; + hmm->init.pos = snapshot->pos; + memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates); + memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates); } void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob) @@ -156,23 +219,18 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates); } - // Init all states with equal likelihood int i,j, nstates = hmm->nstates; - if ( hmm->init_probs ) - for (i=0; ivprob[i] = hmm->init_probs[i]; - else - for (i=0; ivprob[i] = 1./nstates; + memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates); + uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0]; // Run Viterbi - uint32_t prev_pos = sites[0]; for (i=0; ivpath[i*nstates]; double *eprob = &eprobs[i*nstates]; int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1; - _set_tprob(hmm, pos_diff); if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob); prev_pos = sites[i]; @@ -193,6 +251,12 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) } for (j=0; jvprob_tmp[j] /= vnorm; double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp; + + if ( hmm->snapshot && i==hmm->snapshot->isite ) + { + hmm->snapshot->pos = sites[i]; + memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates); + } } // Find the most likely state @@ -226,19 +290,12 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) // Init all states with equal likelihood int i,j,k, nstates = hmm->nstates; - if ( hmm->init_probs ) - { - for (i=0; ifwd[i] = hmm->init_probs[i]; - for (i=0; ibwd[i] = hmm->init_probs[i]; - } - else - { - for (i=0; ifwd[i] = 1./hmm->nstates; - for (i=0; ibwd[i] = 1./hmm->nstates; - } + memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates); + memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates); + + uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0]; // Run fwd - uint32_t prev_pos = sites[0]; for (i=0; ifwd[i*nstates]; @@ -263,6 +320,13 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) for (j=0; jsnapshot ) + { + i = hmm->snapshot->isite; + hmm->snapshot->pos = sites[i]; + memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates); + } + // Run bwd double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp; prev_pos = sites[n-1]; @@ -298,7 +362,7 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) } } -void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) +double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) { // Init arrays when run for the first time if ( hmm->nfwd < n ) @@ -314,16 +378,9 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) // Init all states with equal likelihood int i,j,k, nstates = hmm->nstates; - if ( hmm->init_probs ) - { - for (i=0; ifwd[i] = hmm->init_probs[i]; - for (i=0; ibwd[i] = hmm->init_probs[i]; - } - else - { - for (i=0; ifwd[i] = 1./hmm->nstates; - for (i=0; ibwd[i] = 1./hmm->nstates; - } + memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates); + memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates); + uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0]; // New transition matrix: temporary values double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double)); @@ -331,7 +388,6 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) double *fwd_bwd = (double*) malloc(sizeof(double)*nstates); // Run fwd - uint32_t prev_pos = sites[0]; for (i=0; ifwd[i*nstates]; @@ -418,11 +474,14 @@ void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites) free(tmp_gamma); free(tmp_xi); free(fwd_bwd); + return hmm->curr_tprob; } void hmm_destroy(hmm_t *hmm) { - free(hmm->init_probs); + free(hmm->init.vit_prob); + free(hmm->init.fwd_prob); + free(hmm->init.bwd_prob); free(hmm->vprob); free(hmm->vprob_tmp); free(hmm->vpath); diff --git a/bcftools/HMM.h b/bcftools/HMM.h index 7f01245..3e5cf7f 100644 --- a/bcftools/HMM.h +++ b/bcftools/HMM.h @@ -44,6 +44,10 @@ typedef void (*set_tprob_f) (hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void * hmm_t *hmm_init(int nstates, double *tprob, int ntprob); void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob); +#define HMM_VIT 1 +#define HMM_FWD 2 +#define HMM_BWD 4 + /** * hmm_init_states() - initial state probabilities * @probs: initial state probabilities or NULL to reset to default @@ -52,6 +56,20 @@ void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob); */ void hmm_init_states(hmm_t *hmm, double *probs); +/** + * hmm_snapshot() - take the model's snapshot, intended for sliding HMM + * @snapshot: NULL or snapshot returned by previous hmm_snapshot() call, must be free()-ed by the caller + * @isite: take the snapshot at i-th step + */ +void *hmm_snapshot(hmm_t *hmm, void *snapshot, int isite); + +/** + * hmm_restore() - restore model's snapshot, intended for sliding HMM + * @snapshot: snapshot returned by hmm_snapshot() call or NULL to reset + * @isite: take the snapshot at i-th step + */ +void hmm_restore(hmm_t *hmm, void *snapshot); + /** * hmm_get_tprob() - return the array of transition matrices, precalculated * to ntprob positions. The first matrix is the initial tprob matrix @@ -103,11 +121,11 @@ double *hmm_get_fwd_bwd_prob(hmm_t *hmm); * @eprob: emission probabilities for each site and state (nsites x nstates) * @sites: list of positions * - * Same as hmm_run_fwd_bwd, in addition curr_tprob contains the new - * transition probabilities. In this verison, emission probabilities - * are not updated. + * Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new + * transition probabilities is returned. In this verison, emission + * probabilities are not updated. */ -void hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites); +double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites); void hmm_destroy(hmm_t *hmm); diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c new file mode 100644 index 0000000..b4fb7f1 --- /dev/null +++ b/bcftools/bam2bcf.c @@ -0,0 +1,857 @@ +/* bam2bcf.c -- variant calling. + + Copyright (C) 2010-2012 Broad Institute. + Copyright (C) 2012-2014 Genome Research Ltd. + + Author: Heng Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bam2bcf.h" + +extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); + +#define CALL_DEFTHETA 0.83 +#define DEF_MAPQ 20 + +#define CAP_DIST 25 + +bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) +{ + bcf_callaux_t *bca; + if (theta <= 0.) theta = CALL_DEFTHETA; + bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t)); + bca->capQ = 60; + bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; + bca->min_baseQ = min_baseQ; + bca->e = errmod_init(1. - theta); + bca->min_frac = 0.002; + bca->min_support = 1; + bca->per_sample_flt = 0; + bca->npos = 100; + bca->ref_pos = (int*) malloc(bca->npos*sizeof(int)); + bca->alt_pos = (int*) malloc(bca->npos*sizeof(int)); + bca->nqual = 60; + bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int)); + bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int)); + bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int)); + bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int)); + return bca; +} + +void bcf_call_destroy(bcf_callaux_t *bca) +{ + if (bca == 0) return; + errmod_destroy(bca->e); + if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } + free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq); + free(bca->fwd_mqs); free(bca->rev_mqs); + bca->nqual = 0; + free(bca->bases); free(bca->inscns); free(bca); +} + +// position in the sequence with respect to the aligned part of the read +static int get_position(const bam_pileup1_t *p, int *len) +{ + int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; + for (icig=0; icigb->core.n_cigar; icig++) + { + int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK; + int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) + { + n_tot_bases += ncig; + iread += ncig; + continue; + } + if ( cig==BAM_CINS ) + { + n_tot_bases += ncig; + iread += ncig; + continue; + } + if ( cig==BAM_CSOFT_CLIP ) + { + iread += ncig; + if ( iread<=p->qpos ) edist -= ncig; + continue; + } + if ( cig==BAM_CDEL ) continue; + if ( cig==BAM_CHARD_CLIP ) continue; + if ( cig==BAM_CPAD ) continue; + if ( cig==BAM_CREF_SKIP ) continue; + fprintf(stderr,"todo: cigar %d\n", cig); + assert(0); + } + *len = n_tot_bases; + return edist; +} + +void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) +{ + memset(bca->ref_pos,0,sizeof(int)*bca->npos); + memset(bca->alt_pos,0,sizeof(int)*bca->npos); + memset(bca->ref_mq,0,sizeof(int)*bca->nqual); + memset(bca->alt_mq,0,sizeof(int)*bca->nqual); + memset(bca->ref_bq,0,sizeof(int)*bca->nqual); + memset(bca->alt_bq,0,sizeof(int)*bca->nqual); + memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual); + memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); + if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); + if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); +} + +/* + Notes: + - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies + which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation. + Later it's used for multiallelic calling by bcftools -m + - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. + */ +/* + * This function is called once for each sample. + * _n is number of pilesups pl contributing reads to this sample + * pl is pointer to array of _n pileups (one pileup per read) + * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. + * bca is the settings to perform calls across all samples + * r is the returned value of the call + */ +int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) +{ + int i, n, ref4, is_indel, ori_depth = 0; + + // clean from previous run + r->ori_depth = 0; + r->mq0 = 0; + memset(r->qsum,0,sizeof(float)*4); + memset(r->anno,0,sizeof(double)*16); + memset(r->p,0,sizeof(float)*25); + + if (ref_base >= 0) { + ref4 = seq_nt16_int[ref_base]; + is_indel = 0; + } else ref4 = 4, is_indel = 1; + if (_n == 0) return -1; + // enlarge the bases array if necessary + if (bca->max_bases < _n) { + bca->max_bases = _n; + kroundup32(bca->max_bases); + bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); + } + // fill the bases array + for (i = n = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; + // set base + if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; + ++ori_depth; + mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 + if ( !mapQ ) r->mq0++; + baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality + seqQ = is_indel? (p->aux>>8&0xff) : 99; + if (q < bca->min_baseQ) continue; + if (q > seqQ) q = seqQ; + mapQ = mapQ < bca->capQ? mapQ : bca->capQ; + if (q > mapQ) q = mapQ; + if (q > 63) q = 63; + if (q < 4) q = 4; // MQ=0 reads count as BQ=4 + if (!is_indel) { + b = bam_seqi(bam_get_seq(p->b), p->qpos); // base + b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base + is_diff = (ref4 < 4 && b == ref4)? 0 : 1; + } else { + b = p->aux>>16&0x3f; + is_diff = (b != 0); + } + bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; + // collect annotations + if (b < 4) + { + r->qsum[b] += q; + if ( r->ADF ) + { + if ( bam_is_rev(p->b) ) + r->ADR[b]++; + else + r->ADF[b]++; + } + } + ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)]; + min_dist = p->b->core.l_qseq - 1 - p->qpos; + if (min_dist > p->qpos) min_dist = p->qpos; + if (min_dist > CAP_DIST) min_dist = CAP_DIST; + r->anno[1<<2|is_diff<<1|0] += baseQ; + r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; + r->anno[2<<2|is_diff<<1|0] += mapQ; + r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; + r->anno[3<<2|is_diff<<1|0] += min_dist; + r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; + + // collect for bias tests + if ( baseQ > 59 ) baseQ = 59; + if ( mapQ > 59 ) mapQ = 59; + int len, pos = get_position(p, &len); + int epos = (double)pos/(len+1) * bca->npos; + int ibq = baseQ/60. * bca->nqual; + int imq = mapQ/60. * bca->nqual; + if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; + else bca->fwd_mqs[imq]++; + if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) + { + bca->ref_pos[epos]++; + bca->ref_bq[ibq]++; + bca->ref_mq[imq]++; + } + else + { + bca->alt_pos[epos]++; + bca->alt_bq[ibq]++; + bca->alt_mq[imq]++; + } + } + r->ori_depth = ori_depth; + // glfgen + errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype + return n; +} + + +/* + * calc_vdb() - returns value between zero (most biased) and one (no bias) + * on success, or HUGE_VAL when VDB cannot be calculated because + * of insufficient depth (<2x) + * + * Variant Distance Bias tests if the variant bases are positioned within the + * reads with sufficient randomness. Unlike other tests, it looks only at + * variant reads and therefore gives different kind of information than Read + * Position Bias for instance. VDB was developed for detecting artefacts in + * RNA-seq calls where reads from spliced transcripts span splice site + * boundaries. The current implementation differs somewhat from the original + * version described in supplementary material of PMID:22524474, but the idea + * remains the same. (Here the random variable tested is the average distance + * from the averaged position, not the average pairwise distance.) + * + * For coverage of 2x, the calculation is exact but is approximated for the + * rest. The result is most accurate between 4-200x. For 3x or >200x, the + * reported values are slightly more favourable than those of a true random + * distribution. + */ +double calc_vdb(int *pos, int npos) +{ + // Note well: the parameters were obtained by fitting to simulated data of + // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen(). + const int readlen = 100; + assert( npos==readlen ); + + #define nparam 15 + const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5}, + {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8}, + {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7}, + {200,0.7,23.7} }; + + int i, dp = 0; + float mean_pos = 0, mean_diff = 0; + for (i=0; i=200 ) + i = nparam; // shortcut for big depths + else + { + for (i=0; i=dp ) break; + } + float pshift, pscale; + if ( i==nparam ) + { + // the depth is too high, go with 200x + pscale = param[nparam-1][1]; + pshift = param[nparam-1][2]; + } + else if ( i>0 && param[i][0]!=dp ) + { + // linear interpolation of parameters + pscale = (param[i-1][1] + param[i][1])*0.5; + pshift = (param[i-1][2] + param[i][2])*0.5; + } + else + { + pscale = param[i][1]; + pshift = param[i][2]; + } + return 0.5*kf_erfc(-(mean_diff-pshift)*pscale); +} + +double calc_chisq_bias(int *a, int *b, int n) +{ + int na = 0, nb = 0, i, ndf = n; + for (i=0; i= 2 && m >= 2); + + return (n < 8 && m < 8 && U < 50) + ? mw[n-2][m-2][U] + : mann_whitney_1947_(n,m,U); +} + +double mann_whitney_1947_cdf(int n, int m, int U) +{ + int i; + double sum = 0; + for (i=0; i<=U; i++) + sum += mann_whitney_1947(n,m,i); + return sum; +} + +double calc_mwu_bias_cdf(int *a, int *b, int n) +{ + int na = 0, nb = 0, i; + double U = 0, ties = 0; + for (i=0; i=8 && nb>=8 and reasonable if na<8 or nb<8 + if ( na>=8 || nb>=8 ) + { + double mean = ((double)na*nb)*0.5; + // Correction for ties: + // double N = na+nb; + // double var2 = (N*N-1)*N-ties; + // if ( var2==0 ) return 1.0; + // var2 *= ((double)na*nb)/N/(N-1)/12.0; + // No correction for ties: + double var2 = ((double)na*nb)*(na+nb+1)/12.0; + double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1) + return 2.0 - kf_erfc(z); // which is 1 + erf(z) + } + + // Exact calculation + double pval = 2*mann_whitney_1947_cdf(na,nb,U_min); + return pval>1 ? 1 : pval; +} + +double calc_mwu_bias(int *a, int *b, int n) +{ + int na = 0, nb = 0, i; + double U = 0, ties = 0; + for (i=0; imean ? (2.0*mean-U)/mean : U/mean; + } + // Correction for ties: + // double N = na+nb; + // double var2 = (N*N-1)*N-ties; + // if ( var2==0 ) return 1.0; + // var2 *= ((double)na*nb)/N/(N-1)/12.0; + // No correction for ties: + double var2 = ((double)na*nb)*(na+nb+1)/12.0; + if ( na>=8 || nb>=8 ) + { + // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8 + return exp(-0.5*(U-mean)*(U-mean)/var2); + } + + // Exact calculation + return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2); +} + +static inline double logsumexp2(double a, double b) +{ + if ( a>b ) + return log(1 + exp(b-a)) + a; + else + return log(1 + exp(a-b)) + b; +} + +void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) +{ + call->seg_bias = HUGE_VAL; + if ( !bcr ) return; + + int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads + if ( !nr ) return; + + int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth + double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population + if ( M>call->n ) M = call->n; // clamp M at the number of samples + else if ( M==0 ) M = 1; + double f = M / 2. / call->n; // allele frequency + double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson) + double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson) + double sum = 0; + const double log2 = log(2.0); + + // fprintf(stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); + int i; + for (i=0; in; i++) + { + int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads + double tmp; + if ( oi ) + { + // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow + tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q); + tmp += log(f) + oi*log(q/p) - q + p; + } + else + tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p; + sum += tmp; + // fprintf(stderr,"oi=%d %e\n", oi,tmp); + } + call->seg_bias = sum; +} + +/** + * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles + * @n: number of samples + * @calls: each sample's calls + * @bca: auxiliary data structure for holding temporary values + * @ref_base: the reference base + * @call: filled with the annotations + * + * Combines calls across the various samples being studied + * 1. For each allele at each base across all samples the quality is summed so + * you end up with a set of quality sums for each allele present 2. The quality + * sums are sorted. + * 3. Using the sorted quality sums we now create the allele ordering array + * A\subN. This is done by doing the following: + * a) If the reference allele is known it always comes first, otherwise N + * comes first. + * b) Then the rest of the alleles are output in descending order of quality + * sum (which we already know the qsum array was sorted). Any allelles with + * qsum 0 will be excluded. + * 4. Using the allele ordering array we create the genotype ordering array. + * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1 + * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4 + * 5. The genotype ordering array is then used to extract data from the error + * model 5*5 matrix and is used to produce a Phread likelihood array for each + * sample. + */ +int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) +{ + int ref4, i, j; + float qsum[5] = {0,0,0,0,0}; + if (ref_base >= 0) { + call->ori_ref = ref4 = seq_nt16_int[ref_base]; + if (ref4 > 4) ref4 = 4; + } else call->ori_ref = -1, ref4 = 0; + + // calculate qsum, this is done by summing normalized qsum across all samples, + // to account for differences in coverage + for (i = 0; i < n; ++i) + { + float sum = 0; + for (j = 0; j < 4; ++j) sum += calls[i].qsum[j]; + if ( sum ) + for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum; + } + + // sort qsum in ascending order (insertion sort) + float *ptr[5], *tmp; + for (i=0; i<5; i++) ptr[i] = &qsum[i]; + for (i=1; i<4; i++) + for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--) + tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp; + + // Set the reference allele and alternative allele(s) + for (i=0; i<5; i++) call->a[i] = -1; + for (i=0; i<5; i++) call->qsum[i] = 0; + call->unseen = -1; + call->a[0] = ref4; + for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering + { + int ipos = ptr[i] - qsum; // position in sorted qsum array + if ( ipos==ref4 ) + call->qsum[0] = qsum[ipos]; // REF's qsum + else + { + if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup + call->qsum[j] = qsum[ipos]; + call->a[j++] = ipos; + } + } + if (ref_base >= 0) + { + // for SNPs, find the "unseen" base + if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) + call->unseen = j, call->a[j++] = ptr[i] - qsum; + call->n_alleles = j; + } + else + { + call->n_alleles = j; + if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything + } + /* + * Set the phread likelihood array (call->PL) This array is 15 entries long + * for each sample because that is size of an upper or lower triangle of a + * worst case 5x5 matrix of possible genotypes. This worst case matrix will + * occur when all 4 possible alleles are present and the reference allele + * is unknown. The sides of the matrix will correspond to the reference + * allele (if known) followed by the alleles present in descending order of + * quality sum + */ + { + int x, g[15], z; + double sum_min = 0.; + x = call->n_alleles * (call->n_alleles + 1) / 2; + // get the possible genotypes + // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix + for (i = z = 0; i < call->n_alleles; ++i) { + for (j = 0; j <= i; ++j) { + g[z++] = call->a[j] * 5 + call->a[i]; + } + } + // for each sample calculate the PL + for (i = 0; i < n; ++i) + { + int32_t *PL = call->PL + x * i; + const bcf_callret1_t *r = calls + i; + float min = FLT_MAX; + for (j = 0; j < x; ++j) { + if (min > r->p[g[j]]) min = r->p[g[j]]; + } + sum_min += min; + for (j = 0; j < x; ++j) { + int y; + y = (int)(r->p[g[j]] - min + .499); + if (y > 255) y = 255; + PL[j] = y; + } + } + if ( call->DP4 ) + { + for (i=0; iDP4[4*i] = calls[i].anno[0]; + call->DP4[4*i+1] = calls[i].anno[1]; + call->DP4[4*i+2] = calls[i].anno[2]; + call->DP4[4*i+3] = calls[i].anno[3]; + } + } + if ( call->ADF ) + { + assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well + + // reorder ADR,ADF to match the allele ordering at this site + int32_t tmp[B2B_MAX_ALLELES]; + int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES; + int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES; + int32_t *adr_tot = call->ADR; // the first bin stores total counts per site + int32_t *adf_tot = call->ADF; + for (i=0; in_alleles; j++) + { + tmp[j] = adr[ call->a[j] ]; + adr_tot[j] += tmp[j]; + } + for (j=0; jn_alleles; j++) adr_out[j] = tmp[j]; + for (j=0; jn_alleles; j++) + { + tmp[j] = adf[ call->a[j] ]; + adf_tot[j] += tmp[j]; + } + for (j=0; jn_alleles; j++) adf_out[j] = tmp[j]; + adf_out += call->n_alleles; + adr_out += call->n_alleles; + adr += B2B_MAX_ALLELES; + adf += B2B_MAX_ALLELES; + } + } + +// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); + call->shift = (int)(sum_min + .499); + } + // combine annotations + memset(call->anno, 0, 16 * sizeof(double)); + call->ori_depth = 0; + call->depth = 0; + call->mq0 = 0; + for (i = 0; i < n; ++i) { + call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3]; + call->ori_depth += calls[i].ori_depth; + call->mq0 += calls[i].mq0; + for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; + } + + calc_SegBias(calls, call); + + // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); + // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); + // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); + + call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); + call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); + call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); + call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); + +#if CDF_MWU_TESTS + call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); + call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); + call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); + call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); +#endif + + call->vdb = calc_vdb(bca->alt_pos, bca->npos); + + return 0; +} + +int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) +{ + extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); + int i, j, nals = 1; + + bcf_hdr_t *hdr = bc->bcf_hdr; + rec->rid = bc->tid; + rec->pos = bc->pos; + rec->qual = 0; + + bc->tmp.l = 0; + if (bc->ori_ref < 0) // indel + { + // REF + kputc(ref[bc->pos], &bc->tmp); + for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); + + // ALT + for (i=1; i<4; i++) + { + if (bc->a[i] < 0) break; + kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp); + + if (bca->indel_types[bc->a[i]] < 0) { // deletion + for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j) + kputc(ref[bc->pos+1+j], &bc->tmp); + } else { // insertion; cannot be a reference unless a bug + char *inscns = &bca->inscns[bc->a[i] * bca->maxins]; + for (j = 0; j < bca->indel_types[bc->a[i]]; ++j) + kputc("ACGTN"[(int)inscns[j]], &bc->tmp); + for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); + } + nals++; + } + } + else // SNP + { + kputc("ACGTN"[bc->ori_ref], &bc->tmp); + for (i=1; i<5; i++) + { + if (bc->a[i] < 0) break; + kputc(',', &bc->tmp); + if ( bc->unseen==i ) kputs("<*>", &bc->tmp); + else kputc("ACGT"[bc->a[i]], &bc->tmp); + nals++; + } + } + bcf_update_alleles_str(hdr, rec, bc->tmp.s); + + bc->tmp.l = 0; + + // INFO + if (bc->ori_ref < 0) + { + bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); + bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); + bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); + } + bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); + if ( fmt_flag&B2B_INFO_ADF ) + bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele); + if ( fmt_flag&B2B_INFO_ADR ) + bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele); + if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) ) + { + for (i=0; in_allele; i++) bc->ADF[i] += bc->ADR[i]; + if ( fmt_flag&B2B_INFO_AD ) + bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele); + if ( fmt_flag&B2B_INFO_DPR ) + bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); + } + + float tmpf[16]; + for (i=0; i<16; i++) tmpf[i] = bc->anno[i]; + bcf_update_info_float(hdr, rec, "I16", tmpf, 16); + bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals); + + if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); + if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); + if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); +#if CDF_MWU_TESTS + if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); + if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); + if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); + if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); +#endif + tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0; + bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); + + // FORMAT + rec->n_sample = bc->n; + bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample); + if ( fmt_flag&B2B_FMT_DP ) + { + int32_t *ptr = (int32_t*) bc->fmt_arr; + for (i=0; in; i++) + ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3]; + bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample); + } + if ( fmt_flag&B2B_FMT_DV ) + { + int32_t *ptr = (int32_t*) bc->fmt_arr; + for (i=0; in; i++) + ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3]; + bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample); + } + if ( fmt_flag&B2B_FMT_SP ) + { + int32_t *ptr = (int32_t*) bc->fmt_arr; + for (i=0; in; i++) + { + int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3]; + if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 ) + ptr[i] = 0; + else + { + double left, right, two; + kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two); + int32_t x = (int)(-4.343 * log(two) + .499); + if (x > 255) x = 255; + ptr[i] = x; + } + } + bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample); + } + if ( fmt_flag&B2B_FMT_DP4 ) + bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4); + if ( fmt_flag&B2B_FMT_ADF ) + bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + if ( fmt_flag&B2B_FMT_ADR ) + bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) ) + { + for (i=0; in_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i]; + if ( fmt_flag&B2B_FMT_AD ) + bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + if ( fmt_flag&B2B_FMT_DPR ) + bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + } + + return 0; +} diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c new file mode 100644 index 0000000..5a1a443 --- /dev/null +++ b/bcftools/bam2bcf.c.pysam.c @@ -0,0 +1,859 @@ +#include "pysam.h" + +/* bam2bcf.c -- variant calling. + + Copyright (C) 2010-2012 Broad Institute. + Copyright (C) 2012-2014 Genome Research Ltd. + + Author: Heng Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bam2bcf.h" + +extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); + +#define CALL_DEFTHETA 0.83 +#define DEF_MAPQ 20 + +#define CAP_DIST 25 + +bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) +{ + bcf_callaux_t *bca; + if (theta <= 0.) theta = CALL_DEFTHETA; + bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t)); + bca->capQ = 60; + bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; + bca->min_baseQ = min_baseQ; + bca->e = errmod_init(1. - theta); + bca->min_frac = 0.002; + bca->min_support = 1; + bca->per_sample_flt = 0; + bca->npos = 100; + bca->ref_pos = (int*) malloc(bca->npos*sizeof(int)); + bca->alt_pos = (int*) malloc(bca->npos*sizeof(int)); + bca->nqual = 60; + bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int)); + bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int)); + bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int)); + bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int)); + return bca; +} + +void bcf_call_destroy(bcf_callaux_t *bca) +{ + if (bca == 0) return; + errmod_destroy(bca->e); + if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } + free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq); + free(bca->fwd_mqs); free(bca->rev_mqs); + bca->nqual = 0; + free(bca->bases); free(bca->inscns); free(bca); +} + +// position in the sequence with respect to the aligned part of the read +static int get_position(const bam_pileup1_t *p, int *len) +{ + int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; + for (icig=0; icigb->core.n_cigar; icig++) + { + int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK; + int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) + { + n_tot_bases += ncig; + iread += ncig; + continue; + } + if ( cig==BAM_CINS ) + { + n_tot_bases += ncig; + iread += ncig; + continue; + } + if ( cig==BAM_CSOFT_CLIP ) + { + iread += ncig; + if ( iread<=p->qpos ) edist -= ncig; + continue; + } + if ( cig==BAM_CDEL ) continue; + if ( cig==BAM_CHARD_CLIP ) continue; + if ( cig==BAM_CPAD ) continue; + if ( cig==BAM_CREF_SKIP ) continue; + fprintf(pysam_stderr,"todo: cigar %d\n", cig); + assert(0); + } + *len = n_tot_bases; + return edist; +} + +void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) +{ + memset(bca->ref_pos,0,sizeof(int)*bca->npos); + memset(bca->alt_pos,0,sizeof(int)*bca->npos); + memset(bca->ref_mq,0,sizeof(int)*bca->nqual); + memset(bca->alt_mq,0,sizeof(int)*bca->nqual); + memset(bca->ref_bq,0,sizeof(int)*bca->nqual); + memset(bca->alt_bq,0,sizeof(int)*bca->nqual); + memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual); + memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); + if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); + if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); +} + +/* + Notes: + - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies + which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation. + Later it's used for multiallelic calling by bcftools -m + - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. + */ +/* + * This function is called once for each sample. + * _n is number of pilesups pl contributing reads to this sample + * pl is pointer to array of _n pileups (one pileup per read) + * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. + * bca is the settings to perform calls across all samples + * r is the returned value of the call + */ +int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) +{ + int i, n, ref4, is_indel, ori_depth = 0; + + // clean from previous run + r->ori_depth = 0; + r->mq0 = 0; + memset(r->qsum,0,sizeof(float)*4); + memset(r->anno,0,sizeof(double)*16); + memset(r->p,0,sizeof(float)*25); + + if (ref_base >= 0) { + ref4 = seq_nt16_int[ref_base]; + is_indel = 0; + } else ref4 = 4, is_indel = 1; + if (_n == 0) return -1; + // enlarge the bases array if necessary + if (bca->max_bases < _n) { + bca->max_bases = _n; + kroundup32(bca->max_bases); + bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); + } + // fill the bases array + for (i = n = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; + // set base + if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; + ++ori_depth; + mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 + if ( !mapQ ) r->mq0++; + baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality + seqQ = is_indel? (p->aux>>8&0xff) : 99; + if (q < bca->min_baseQ) continue; + if (q > seqQ) q = seqQ; + mapQ = mapQ < bca->capQ? mapQ : bca->capQ; + if (q > mapQ) q = mapQ; + if (q > 63) q = 63; + if (q < 4) q = 4; // MQ=0 reads count as BQ=4 + if (!is_indel) { + b = bam_seqi(bam_get_seq(p->b), p->qpos); // base + b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base + is_diff = (ref4 < 4 && b == ref4)? 0 : 1; + } else { + b = p->aux>>16&0x3f; + is_diff = (b != 0); + } + bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; + // collect annotations + if (b < 4) + { + r->qsum[b] += q; + if ( r->ADF ) + { + if ( bam_is_rev(p->b) ) + r->ADR[b]++; + else + r->ADF[b]++; + } + } + ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)]; + min_dist = p->b->core.l_qseq - 1 - p->qpos; + if (min_dist > p->qpos) min_dist = p->qpos; + if (min_dist > CAP_DIST) min_dist = CAP_DIST; + r->anno[1<<2|is_diff<<1|0] += baseQ; + r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; + r->anno[2<<2|is_diff<<1|0] += mapQ; + r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; + r->anno[3<<2|is_diff<<1|0] += min_dist; + r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; + + // collect for bias tests + if ( baseQ > 59 ) baseQ = 59; + if ( mapQ > 59 ) mapQ = 59; + int len, pos = get_position(p, &len); + int epos = (double)pos/(len+1) * bca->npos; + int ibq = baseQ/60. * bca->nqual; + int imq = mapQ/60. * bca->nqual; + if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; + else bca->fwd_mqs[imq]++; + if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) + { + bca->ref_pos[epos]++; + bca->ref_bq[ibq]++; + bca->ref_mq[imq]++; + } + else + { + bca->alt_pos[epos]++; + bca->alt_bq[ibq]++; + bca->alt_mq[imq]++; + } + } + r->ori_depth = ori_depth; + // glfgen + errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype + return n; +} + + +/* + * calc_vdb() - returns value between zero (most biased) and one (no bias) + * on success, or HUGE_VAL when VDB cannot be calculated because + * of insufficient depth (<2x) + * + * Variant Distance Bias tests if the variant bases are positioned within the + * reads with sufficient randomness. Unlike other tests, it looks only at + * variant reads and therefore gives different kind of information than Read + * Position Bias for instance. VDB was developed for detecting artefacts in + * RNA-seq calls where reads from spliced transcripts span splice site + * boundaries. The current implementation differs somewhat from the original + * version described in supplementary material of PMID:22524474, but the idea + * remains the same. (Here the random variable tested is the average distance + * from the averaged position, not the average pairwise distance.) + * + * For coverage of 2x, the calculation is exact but is approximated for the + * rest. The result is most accurate between 4-200x. For 3x or >200x, the + * reported values are slightly more favourable than those of a true random + * distribution. + */ +double calc_vdb(int *pos, int npos) +{ + // Note well: the parameters were obtained by fitting to simulated data of + // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen(). + const int readlen = 100; + assert( npos==readlen ); + + #define nparam 15 + const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5}, + {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8}, + {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7}, + {200,0.7,23.7} }; + + int i, dp = 0; + float mean_pos = 0, mean_diff = 0; + for (i=0; i=200 ) + i = nparam; // shortcut for big depths + else + { + for (i=0; i=dp ) break; + } + float pshift, pscale; + if ( i==nparam ) + { + // the depth is too high, go with 200x + pscale = param[nparam-1][1]; + pshift = param[nparam-1][2]; + } + else if ( i>0 && param[i][0]!=dp ) + { + // linear interpolation of parameters + pscale = (param[i-1][1] + param[i][1])*0.5; + pshift = (param[i-1][2] + param[i][2])*0.5; + } + else + { + pscale = param[i][1]; + pshift = param[i][2]; + } + return 0.5*kf_erfc(-(mean_diff-pshift)*pscale); +} + +double calc_chisq_bias(int *a, int *b, int n) +{ + int na = 0, nb = 0, i, ndf = n; + for (i=0; i= 2 && m >= 2); + + return (n < 8 && m < 8 && U < 50) + ? mw[n-2][m-2][U] + : mann_whitney_1947_(n,m,U); +} + +double mann_whitney_1947_cdf(int n, int m, int U) +{ + int i; + double sum = 0; + for (i=0; i<=U; i++) + sum += mann_whitney_1947(n,m,i); + return sum; +} + +double calc_mwu_bias_cdf(int *a, int *b, int n) +{ + int na = 0, nb = 0, i; + double U = 0, ties = 0; + for (i=0; i=8 && nb>=8 and reasonable if na<8 or nb<8 + if ( na>=8 || nb>=8 ) + { + double mean = ((double)na*nb)*0.5; + // Correction for ties: + // double N = na+nb; + // double var2 = (N*N-1)*N-ties; + // if ( var2==0 ) return 1.0; + // var2 *= ((double)na*nb)/N/(N-1)/12.0; + // No correction for ties: + double var2 = ((double)na*nb)*(na+nb+1)/12.0; + double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1) + return 2.0 - kf_erfc(z); // which is 1 + erf(z) + } + + // Exact calculation + double pval = 2*mann_whitney_1947_cdf(na,nb,U_min); + return pval>1 ? 1 : pval; +} + +double calc_mwu_bias(int *a, int *b, int n) +{ + int na = 0, nb = 0, i; + double U = 0, ties = 0; + for (i=0; imean ? (2.0*mean-U)/mean : U/mean; + } + // Correction for ties: + // double N = na+nb; + // double var2 = (N*N-1)*N-ties; + // if ( var2==0 ) return 1.0; + // var2 *= ((double)na*nb)/N/(N-1)/12.0; + // No correction for ties: + double var2 = ((double)na*nb)*(na+nb+1)/12.0; + if ( na>=8 || nb>=8 ) + { + // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8 + return exp(-0.5*(U-mean)*(U-mean)/var2); + } + + // Exact calculation + return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2); +} + +static inline double logsumexp2(double a, double b) +{ + if ( a>b ) + return log(1 + exp(b-a)) + a; + else + return log(1 + exp(a-b)) + b; +} + +void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) +{ + call->seg_bias = HUGE_VAL; + if ( !bcr ) return; + + int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads + if ( !nr ) return; + + int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth + double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population + if ( M>call->n ) M = call->n; // clamp M at the number of samples + else if ( M==0 ) M = 1; + double f = M / 2. / call->n; // allele frequency + double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson) + double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson) + double sum = 0; + const double log2 = log(2.0); + + // fprintf(pysam_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); + int i; + for (i=0; in; i++) + { + int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads + double tmp; + if ( oi ) + { + // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow + tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q); + tmp += log(f) + oi*log(q/p) - q + p; + } + else + tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p; + sum += tmp; + // fprintf(pysam_stderr,"oi=%d %e\n", oi,tmp); + } + call->seg_bias = sum; +} + +/** + * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles + * @n: number of samples + * @calls: each sample's calls + * @bca: auxiliary data structure for holding temporary values + * @ref_base: the reference base + * @call: filled with the annotations + * + * Combines calls across the various samples being studied + * 1. For each allele at each base across all samples the quality is summed so + * you end up with a set of quality sums for each allele present 2. The quality + * sums are sorted. + * 3. Using the sorted quality sums we now create the allele ordering array + * A\subN. This is done by doing the following: + * a) If the reference allele is known it always comes first, otherwise N + * comes first. + * b) Then the rest of the alleles are output in descending order of quality + * sum (which we already know the qsum array was sorted). Any allelles with + * qsum 0 will be excluded. + * 4. Using the allele ordering array we create the genotype ordering array. + * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1 + * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4 + * 5. The genotype ordering array is then used to extract data from the error + * model 5*5 matrix and is used to produce a Phread likelihood array for each + * sample. + */ +int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) +{ + int ref4, i, j; + float qsum[5] = {0,0,0,0,0}; + if (ref_base >= 0) { + call->ori_ref = ref4 = seq_nt16_int[ref_base]; + if (ref4 > 4) ref4 = 4; + } else call->ori_ref = -1, ref4 = 0; + + // calculate qsum, this is done by summing normalized qsum across all samples, + // to account for differences in coverage + for (i = 0; i < n; ++i) + { + float sum = 0; + for (j = 0; j < 4; ++j) sum += calls[i].qsum[j]; + if ( sum ) + for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum; + } + + // sort qsum in ascending order (insertion sort) + float *ptr[5], *tmp; + for (i=0; i<5; i++) ptr[i] = &qsum[i]; + for (i=1; i<4; i++) + for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--) + tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp; + + // Set the reference allele and alternative allele(s) + for (i=0; i<5; i++) call->a[i] = -1; + for (i=0; i<5; i++) call->qsum[i] = 0; + call->unseen = -1; + call->a[0] = ref4; + for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering + { + int ipos = ptr[i] - qsum; // position in sorted qsum array + if ( ipos==ref4 ) + call->qsum[0] = qsum[ipos]; // REF's qsum + else + { + if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup + call->qsum[j] = qsum[ipos]; + call->a[j++] = ipos; + } + } + if (ref_base >= 0) + { + // for SNPs, find the "unseen" base + if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) + call->unseen = j, call->a[j++] = ptr[i] - qsum; + call->n_alleles = j; + } + else + { + call->n_alleles = j; + if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything + } + /* + * Set the phread likelihood array (call->PL) This array is 15 entries long + * for each sample because that is size of an upper or lower triangle of a + * worst case 5x5 matrix of possible genotypes. This worst case matrix will + * occur when all 4 possible alleles are present and the reference allele + * is unknown. The sides of the matrix will correspond to the reference + * allele (if known) followed by the alleles present in descending order of + * quality sum + */ + { + int x, g[15], z; + double sum_min = 0.; + x = call->n_alleles * (call->n_alleles + 1) / 2; + // get the possible genotypes + // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix + for (i = z = 0; i < call->n_alleles; ++i) { + for (j = 0; j <= i; ++j) { + g[z++] = call->a[j] * 5 + call->a[i]; + } + } + // for each sample calculate the PL + for (i = 0; i < n; ++i) + { + int32_t *PL = call->PL + x * i; + const bcf_callret1_t *r = calls + i; + float min = FLT_MAX; + for (j = 0; j < x; ++j) { + if (min > r->p[g[j]]) min = r->p[g[j]]; + } + sum_min += min; + for (j = 0; j < x; ++j) { + int y; + y = (int)(r->p[g[j]] - min + .499); + if (y > 255) y = 255; + PL[j] = y; + } + } + if ( call->DP4 ) + { + for (i=0; iDP4[4*i] = calls[i].anno[0]; + call->DP4[4*i+1] = calls[i].anno[1]; + call->DP4[4*i+2] = calls[i].anno[2]; + call->DP4[4*i+3] = calls[i].anno[3]; + } + } + if ( call->ADF ) + { + assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well + + // reorder ADR,ADF to match the allele ordering at this site + int32_t tmp[B2B_MAX_ALLELES]; + int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES; + int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES; + int32_t *adr_tot = call->ADR; // the first bin stores total counts per site + int32_t *adf_tot = call->ADF; + for (i=0; in_alleles; j++) + { + tmp[j] = adr[ call->a[j] ]; + adr_tot[j] += tmp[j]; + } + for (j=0; jn_alleles; j++) adr_out[j] = tmp[j]; + for (j=0; jn_alleles; j++) + { + tmp[j] = adf[ call->a[j] ]; + adf_tot[j] += tmp[j]; + } + for (j=0; jn_alleles; j++) adf_out[j] = tmp[j]; + adf_out += call->n_alleles; + adr_out += call->n_alleles; + adr += B2B_MAX_ALLELES; + adf += B2B_MAX_ALLELES; + } + } + +// if (ref_base < 0) fprintf(pysam_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); + call->shift = (int)(sum_min + .499); + } + // combine annotations + memset(call->anno, 0, 16 * sizeof(double)); + call->ori_depth = 0; + call->depth = 0; + call->mq0 = 0; + for (i = 0; i < n; ++i) { + call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3]; + call->ori_depth += calls[i].ori_depth; + call->mq0 += calls[i].mq0; + for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; + } + + calc_SegBias(calls, call); + + // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); + // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); + // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); + + call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); + call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); + call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); + call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); + +#if CDF_MWU_TESTS + call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); + call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); + call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); + call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); +#endif + + call->vdb = calc_vdb(bca->alt_pos, bca->npos); + + return 0; +} + +int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) +{ + extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); + int i, j, nals = 1; + + bcf_hdr_t *hdr = bc->bcf_hdr; + rec->rid = bc->tid; + rec->pos = bc->pos; + rec->qual = 0; + + bc->tmp.l = 0; + if (bc->ori_ref < 0) // indel + { + // REF + kputc(ref[bc->pos], &bc->tmp); + for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); + + // ALT + for (i=1; i<4; i++) + { + if (bc->a[i] < 0) break; + kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp); + + if (bca->indel_types[bc->a[i]] < 0) { // deletion + for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j) + kputc(ref[bc->pos+1+j], &bc->tmp); + } else { // insertion; cannot be a reference unless a bug + char *inscns = &bca->inscns[bc->a[i] * bca->maxins]; + for (j = 0; j < bca->indel_types[bc->a[i]]; ++j) + kputc("ACGTN"[(int)inscns[j]], &bc->tmp); + for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); + } + nals++; + } + } + else // SNP + { + kputc("ACGTN"[bc->ori_ref], &bc->tmp); + for (i=1; i<5; i++) + { + if (bc->a[i] < 0) break; + kputc(',', &bc->tmp); + if ( bc->unseen==i ) kputs("<*>", &bc->tmp); + else kputc("ACGT"[bc->a[i]], &bc->tmp); + nals++; + } + } + bcf_update_alleles_str(hdr, rec, bc->tmp.s); + + bc->tmp.l = 0; + + // INFO + if (bc->ori_ref < 0) + { + bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); + bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); + bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); + } + bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); + if ( fmt_flag&B2B_INFO_ADF ) + bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele); + if ( fmt_flag&B2B_INFO_ADR ) + bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele); + if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) ) + { + for (i=0; in_allele; i++) bc->ADF[i] += bc->ADR[i]; + if ( fmt_flag&B2B_INFO_AD ) + bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele); + if ( fmt_flag&B2B_INFO_DPR ) + bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); + } + + float tmpf[16]; + for (i=0; i<16; i++) tmpf[i] = bc->anno[i]; + bcf_update_info_float(hdr, rec, "I16", tmpf, 16); + bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals); + + if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); + if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); + if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); +#if CDF_MWU_TESTS + if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); + if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); + if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); + if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); +#endif + tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0; + bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); + + // FORMAT + rec->n_sample = bc->n; + bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample); + if ( fmt_flag&B2B_FMT_DP ) + { + int32_t *ptr = (int32_t*) bc->fmt_arr; + for (i=0; in; i++) + ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3]; + bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample); + } + if ( fmt_flag&B2B_FMT_DV ) + { + int32_t *ptr = (int32_t*) bc->fmt_arr; + for (i=0; in; i++) + ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3]; + bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample); + } + if ( fmt_flag&B2B_FMT_SP ) + { + int32_t *ptr = (int32_t*) bc->fmt_arr; + for (i=0; in; i++) + { + int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3]; + if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 ) + ptr[i] = 0; + else + { + double left, right, two; + kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two); + int32_t x = (int)(-4.343 * log(two) + .499); + if (x > 255) x = 255; + ptr[i] = x; + } + } + bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample); + } + if ( fmt_flag&B2B_FMT_DP4 ) + bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4); + if ( fmt_flag&B2B_FMT_ADF ) + bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + if ( fmt_flag&B2B_FMT_ADR ) + bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) ) + { + for (i=0; in_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i]; + if ( fmt_flag&B2B_FMT_AD ) + bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + if ( fmt_flag&B2B_FMT_DPR ) + bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); + } + + return 0; +} diff --git a/bcftools/bam2bcf.h b/bcftools/bam2bcf.h new file mode 100644 index 0000000..f81f9cf --- /dev/null +++ b/bcftools/bam2bcf.h @@ -0,0 +1,138 @@ +/* bam2bcf.h -- variant calling. + + Copyright (C) 2010-2012 Broad Institute. + Copyright (C) 2012-2014,2016 Genome Research Ltd. + + Author: Heng Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef BAM2BCF_H +#define BAM2BCF_H + +#include +#include +#include + +/** + * A simplified version of Mann-Whitney U-test is calculated + * by default (no CDF) because it is faster and seems to work + * better in machine learning filtering. When enabled by setting + * CDF_MWU_TESTS, additional annotations will appear on mpileup's + * output (RPB2 in addition to RPB, etc.). + */ +#ifndef CDF_MWU_TESTS +#define CDF_MWU_TESTS 0 +#endif + +#define B2B_INDEL_NULL 10000 + +#define B2B_FMT_DP (1<<0) +#define B2B_FMT_SP (1<<1) +#define B2B_FMT_DV (1<<2) +#define B2B_FMT_DP4 (1<<3) +#define B2B_FMT_DPR (1<<4) +#define B2B_INFO_DPR (1<<5) +#define B2B_FMT_AD (1<<6) +#define B2B_FMT_ADF (1<<7) +#define B2B_FMT_ADR (1<<8) +#define B2B_INFO_AD (1<<9) +#define B2B_INFO_ADF (1<<10) +#define B2B_INFO_ADR (1<<11) + +#define B2B_MAX_ALLELES 5 + +typedef struct __bcf_callaux_t { + int capQ, min_baseQ; + int openQ, extQ, tandemQ; // for indels + uint32_t min_support, max_support; // for collecting indel candidates + double min_frac; // for collecting indel candidates + float max_frac; // for collecting indel candidates + int per_sample_flt; // indel filtering strategy + int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests + // for internal uses + int max_bases; + int indel_types[4]; // indel lengths + int maxins, indelreg; + int read_len; + char *inscns; + uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types) + errmod_t *e; + void *rghash; +} bcf_callaux_t; + +typedef struct { + uint32_t ori_depth; + unsigned int mq0; + int32_t *ADF, *ADR; + float qsum[4]; + // The fields are: + // depth fwd .. ref (0) and non-ref (2) + // depth rev .. ref (1) and non-ref (3) + // baseQ .. ref (4) and non-ref (6) + // baseQ^2 .. ref (5) and non-ref (7) + // mapQ .. ref (8) and non-ref (10) + // mapQ^2 .. ref (9) and non-ref (11) + // minDist .. ref (12) and non-ref (14) + // minDist^2 .. ref (13) and non-ref (15) + // Note that this probably needs a more thorough fix: int types in + // bcf_call_t do overflow with high-coverage data, such as exomes, and + // BCFv2 supports only floats which may not suffice. + double anno[16]; + float p[25]; // phred-scaled likelihood of each genotype +} bcf_callret1_t; + +typedef struct { + int tid, pos; + bcf_hdr_t *bcf_hdr; + int a[5]; // alleles: ref, alt, alt2, alt3 + float qsum[5]; // for the QS tag + int n, n_alleles, shift, ori_ref, unseen; + int n_supp; // number of supporting non-reference reads + double anno[16]; + unsigned int depth, ori_depth, mq0; + int32_t *PL, *DP4, *ADR, *ADF; + uint8_t *fmt_arr; + float vdb; // variant distance bias + float mwu_pos, mwu_mq, mwu_bq, mwu_mqs; +#if CDF_MWU_TESTS + float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf; +#endif + float seg_bias; + kstring_t tmp; +} bcf_call_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); + void bcf_call_destroy(bcf_callaux_t *bca); + int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); + int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); + int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, + const bcf_callaux_t *bca, const char *ref); + int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); + void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bcftools/bam2bcf_indel.c b/bcftools/bam2bcf_indel.c new file mode 100644 index 0000000..52837b5 --- /dev/null +++ b/bcftools/bam2bcf_indel.c @@ -0,0 +1,470 @@ +/* bam2bcf_indel.c -- indel caller. + + Copyright (C) 2010, 2011 Broad Institute. + Copyright (C) 2012-2014,2016 Genome Research Ltd. + + Author: Heng Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include "bam2bcf.h" + +#include +KSORT_INIT_GENERIC(uint32_t) + +#define MINUS_CONST 0x10000000 +#define INDEL_WINDOW_SIZE 50 + +static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +{ + int k, x = c->pos, y = 0, last_y = 0; + *_tpos = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int l = cigar[k] >> BAM_CIGAR_SHIFT; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (c->pos > tpos) return y; + if (x + l > tpos) { + *_tpos = tpos; + return y + (tpos - x); + } + x += l; y += l; + last_y = y; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + if (x + l > tpos) { + *_tpos = is_left? x : x + l; + return y; + } + x += l; + } + } + *_tpos = x; + return last_y; +} +// FIXME: check if the inserted sequence is consistent with the homopolymer run +// l is the relative gap length and l_run is the length of the homopolymer on the reference +static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) +{ + int q, qh; + q = bca->openQ + bca->extQ * (abs(l) - 1); + qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; + return q < qh? q : qh; +} + +static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) +{ + int i, j, max = 0, max_i = pos, score = 0; + l = abs(l); + for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { + if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; + else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1; + if (score < 0) break; + if (max < score) max = score, max_i = i; + } + return max_i - pos; +} + +/* + notes: + - n .. number of samples + - the routine sets bam_pileup1_t.aux of each read as follows: + - 6: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. aux&0xff + */ +int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +{ + int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; + int N, K, l_run, ref_type, n_alt; + char *inscns = 0, *ref2, *query, **ref_sample; + if (ref == 0 || bca == 0) return -1; + + // determine if there is a gap + for (s = N = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) + if (plp[s][i].indel != 0) break; + if (i < n_plp[s]) break; + } + if (s == n) return -1; // there is no indel at this position. + for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads + { // find out how many types of indels are present + bca->max_support = bca->max_frac = 0; + int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; + uint32_t *aux; + aux = (uint32_t*) calloc(N + 1, 4); + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type + for (s = 0; s < n; ++s) { + int na = 0, nt = 0; + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + ++nt; + if (p->indel != 0) { + ++na; + aux[m++] = MINUS_CONST + p->indel; + } + j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + double frac = (double)na/nt; + if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; + n_alt += na; + n_tot += nt; + } + // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), + // check the number of N's in the sequence and skip places where half or more reference bases are Ns. + int nN=0; for (i=pos; i-pos(i-pos) ) { free(aux); return -1; } + + ks_introsort(uint32_t, m, aux); + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + // Taking totals makes it hard to call rare indels + if ( !bca->per_sample_flt ) + indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); return -1; + } + if (n_types >= 64) { + free(aux); + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) + fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); + return -1; + } + types = (int*)calloc(n_types, sizeof(int)); + t = 0; + types[t++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) + if (aux[i] != aux[i-1]) + types[t++] = aux[i] - MINUS_CONST; + free(aux); + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + ref_type = t; // the index of the reference type (0) + } + { // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + } + /* The following block fixes a long-existing flaw in the INDEL + * calling model: the interference of nearby SNPs. However, it also + * reduces the power because sometimes, substitutions caused by + * indels are not distinguishable from true mutations. Multiple + * sequence realignment helps to increase the power. + * + * Masks mismatches present in at least 70% of the reads with 'N'. + */ + { // construct per-sample consensus + int L = right - left + 1, max_i, max2_i; + uint32_t *cns, max, max2; + char *ref0, *r; + ref_sample = (char**) calloc(n, sizeof(char*)); + cns = (uint32_t*) calloc(L, 4); + ref0 = (char*) calloc(L, 1); + for (i = 0; i < right - left; ++i) + ref0[i] = seq_nt16_table[(int)ref[i+left]]; + for (s = 0; s < n; ++s) { + r = ref_sample[s] = (char*) calloc(L, 1); + memset(cns, 0, sizeof(int) * L); + // collect ref and non-ref counts + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + int x = b->core.pos, y = 0; + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k]&0xf; + int j, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (j = 0; j < l; ++j) + if (x + j >= left && x + j < right) + cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } + // determine the consensus + for (i = 0; i < right - left; ++i) r[i] = ref0[i]; + max = max2 = 0; max_i = max2_i = -1; + for (i = 0; i < right - left; ++i) { + if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; + else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; + } + if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; + if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; + if (max_i >= 0) r[max_i] = 15; + if (max2_i >= 0) r[max2_i] = 15; + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); + } + free(ref0); free(cns); + } + { // the length of the homopolymer run around the current position + int c = seq_nt16_table[(int)ref[pos + 1]]; + if (c == 15) l_run = 1; + else { + for (i = pos + 2; ref[i]; ++i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run = i; + for (i = pos; i >= 0; --i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run -= i + 1; + } + } + // construct the consensus sequence + max_ins = types[n_types - 1]; // max_ins is at least 0 + if (max_ins > 0) { + int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int)); + // count the number of occurrences of each base at each position for each type of insertion + for (t = 0; t < n_types; ++t) { + if (types[t] > 0) { + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + if (p->indel == types[t]) { + uint8_t *seq = bam_get_seq(p->b); + for (k = 1; k <= p->indel; ++k) { + int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; + } + } + } + } + } + } + // use the majority rule to construct the consensus + inscns = (char*) calloc(n_types * max_ins, 1); + for (t = 0; t < n_types; ++t) { + for (j = 0; j < types[t]; ++j) { + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) + if (ia[k] > max) + max = ia[k], max_k = k; + inscns[t*max_ins + j] = max? max_k : 4; + if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's + } + } + free(inscns_aux); + } + // compute the likelihood given each type of indel for each read + max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); + ref2 = (char*) calloc(max_ref2, 1); + query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); + score1 = (int*) calloc(N * n_types, sizeof(int)); + score2 = (int*) calloc(N * n_types, sizeof(int)); + bca->indelreg = 0; + for (t = 0; t < n_types; ++t) { + int l, ir; + probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; + apf1.bw = apf2.bw = abs(types[t]) + 3; + // compute indelreg + if (types[t] == 0) ir = 0; + else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); + else ir = est_indelreg(pos, ref, -types[t], 0); + if (ir > bca->indelreg) bca->indelreg = ir; +// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); + // realignment + for (s = K = 0; s < n; ++s) { + // write ref2 + for (k = 0, j = left; j <= pos; ++j) + ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; + if (types[t] <= 0) j += -types[t]; + else for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + for (; j < right && ref[j]; ++j) + ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; + for (; k < max_ref2; ++k) ref2[k] = 4; + if (j < right) right = j; + // align each read to ref2 + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int qbeg, qend, tbeg, tend, sc, kk; + uint8_t *seq = bam_get_seq(p->b); + uint32_t *cigar = bam_get_cigar(p->b); + if (p->b->core.flag&4) continue; // unmapped reads + // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. + for (kk = 0; kk < p->b->core.n_cigar; ++kk) + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; + if (kk < p->b->core.n_cigar) continue; + // FIXME: the following skips soft clips, but using them may be more sensitive. + // determine the start and end of sequences for alignment + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); + if (types[t] < 0) { + int l = -types[t]; + tbeg = tbeg - l > left? tbeg - l : left; + } + // write the query sequence + for (l = qbeg; l < qend; ++l) + query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; + { // do realignment; this is the bottleneck + const uint8_t *qual = bam_get_qual(p->b), *bq; + uint8_t *qq; + qq = (uint8_t*) calloc(qend - qbeg, 1); + bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); + if (bq) ++bq; // skip type + for (l = qbeg; l < qend; ++l) { + qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; + if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; + if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; + } + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); + l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below + if (l > 255) l = 255; + score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; + if (sc > 5) { + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); + l = (int)(100. * sc / (qend - qbeg) + .499); + if (l > 255) l = 255; + score2[K*n_types + t] = sc<<8 | l; + } + free(qq); + } +/* + for (l = 0; l < tend - tbeg + abs(types[t]); ++l) + fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); + fputc('\n', stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr); + fputc('\n', stderr); + fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); +*/ + } + } + } + free(ref2); free(query); + { // compute indelQ + int sc_a[16], sumq_a[16]; + int tmp, *sc = sc_a, *sumq = sumq_a; + if (n_types > 16) { + sc = (int *)malloc(n_types * sizeof(int)); + sumq = (int *)malloc(n_types * sizeof(int)); + } + memset(sumq, 0, n_types * sizeof(int)); + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. + */ + if ((sc[0]&0x3f) == ref_type) { + indelQ1 = (sc[1]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ1 = (sc[t]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); + } + tmp = sc[0]>>6 & 0xff; + indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ + sct = &score2[K*n_types]; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + if ((sc[0]&0x3f) == ref_type) { + indelQ2 = (sc[1]>>14) - (sc[0]>>14); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ2 = (sc[t]>>14) - (sc[0]>>14); + } + tmp = sc[0]>>6 & 0xff; + indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); + // pick the smaller between indelQ1 and indelQ2 + indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total + sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; +// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + } + } + // determine bca->indel_types[] and bca->inscns + bca->maxins = max_ins; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) { + bca->indel_types[t] = types[sumq[t]&0x3f]; + memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); + } + // update p->aux + for (s = n_alt = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + int x = types[p->aux>>16&0x3f]; + for (j = 0; j < 4; ++j) + if (x == bca->indel_types[j]) break; + p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); + if ((p->aux>>16&0x3f) > 0) ++n_alt; + //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + } + } + + if (sc != sc_a) free(sc); + if (sumq != sumq_a) free(sumq); + } + free(score1); free(score2); + // free + for (i = 0; i < n; ++i) free(ref_sample[i]); + free(ref_sample); + free(types); free(inscns); + return n_alt > 0? 0 : -1; +} diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c new file mode 100644 index 0000000..0d36841 --- /dev/null +++ b/bcftools/bam2bcf_indel.c.pysam.c @@ -0,0 +1,472 @@ +#include "pysam.h" + +/* bam2bcf_indel.c -- indel caller. + + Copyright (C) 2010, 2011 Broad Institute. + Copyright (C) 2012-2014,2016 Genome Research Ltd. + + Author: Heng Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include "bam2bcf.h" + +#include +KSORT_INIT_GENERIC(uint32_t) + +#define MINUS_CONST 0x10000000 +#define INDEL_WINDOW_SIZE 50 + +static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +{ + int k, x = c->pos, y = 0, last_y = 0; + *_tpos = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int l = cigar[k] >> BAM_CIGAR_SHIFT; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (c->pos > tpos) return y; + if (x + l > tpos) { + *_tpos = tpos; + return y + (tpos - x); + } + x += l; y += l; + last_y = y; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + if (x + l > tpos) { + *_tpos = is_left? x : x + l; + return y; + } + x += l; + } + } + *_tpos = x; + return last_y; +} +// FIXME: check if the inserted sequence is consistent with the homopolymer run +// l is the relative gap length and l_run is the length of the homopolymer on the reference +static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) +{ + int q, qh; + q = bca->openQ + bca->extQ * (abs(l) - 1); + qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; + return q < qh? q : qh; +} + +static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) +{ + int i, j, max = 0, max_i = pos, score = 0; + l = abs(l); + for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { + if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; + else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1; + if (score < 0) break; + if (max < score) max = score, max_i = i; + } + return max_i - pos; +} + +/* + notes: + - n .. number of samples + - the routine sets bam_pileup1_t.aux of each read as follows: + - 6: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. aux&0xff + */ +int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +{ + int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; + int N, K, l_run, ref_type, n_alt; + char *inscns = 0, *ref2, *query, **ref_sample; + if (ref == 0 || bca == 0) return -1; + + // determine if there is a gap + for (s = N = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) + if (plp[s][i].indel != 0) break; + if (i < n_plp[s]) break; + } + if (s == n) return -1; // there is no indel at this position. + for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads + { // find out how many types of indels are present + bca->max_support = bca->max_frac = 0; + int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; + uint32_t *aux; + aux = (uint32_t*) calloc(N + 1, 4); + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type + for (s = 0; s < n; ++s) { + int na = 0, nt = 0; + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + ++nt; + if (p->indel != 0) { + ++na; + aux[m++] = MINUS_CONST + p->indel; + } + j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + double frac = (double)na/nt; + if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; + n_alt += na; + n_tot += nt; + } + // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), + // check the number of N's in the sequence and skip places where half or more reference bases are Ns. + int nN=0; for (i=pos; i-pos(i-pos) ) { free(aux); return -1; } + + ks_introsort(uint32_t, m, aux); + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + // Taking totals makes it hard to call rare indels + if ( !bca->per_sample_flt ) + indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); return -1; + } + if (n_types >= 64) { + free(aux); + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) + fprintf(pysam_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); + return -1; + } + types = (int*)calloc(n_types, sizeof(int)); + t = 0; + types[t++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) + if (aux[i] != aux[i-1]) + types[t++] = aux[i] - MINUS_CONST; + free(aux); + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + ref_type = t; // the index of the reference type (0) + } + { // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + } + /* The following block fixes a long-existing flaw in the INDEL + * calling model: the interference of nearby SNPs. However, it also + * reduces the power because sometimes, substitutions caused by + * indels are not distinguishable from true mutations. Multiple + * sequence realignment helps to increase the power. + * + * Masks mismatches present in at least 70% of the reads with 'N'. + */ + { // construct per-sample consensus + int L = right - left + 1, max_i, max2_i; + uint32_t *cns, max, max2; + char *ref0, *r; + ref_sample = (char**) calloc(n, sizeof(char*)); + cns = (uint32_t*) calloc(L, 4); + ref0 = (char*) calloc(L, 1); + for (i = 0; i < right - left; ++i) + ref0[i] = seq_nt16_table[(int)ref[i+left]]; + for (s = 0; s < n; ++s) { + r = ref_sample[s] = (char*) calloc(L, 1); + memset(cns, 0, sizeof(int) * L); + // collect ref and non-ref counts + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + int x = b->core.pos, y = 0; + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k]&0xf; + int j, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (j = 0; j < l; ++j) + if (x + j >= left && x + j < right) + cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } + // determine the consensus + for (i = 0; i < right - left; ++i) r[i] = ref0[i]; + max = max2 = 0; max_i = max2_i = -1; + for (i = 0; i < right - left; ++i) { + if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; + else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; + } + if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; + if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; + if (max_i >= 0) r[max_i] = 15; + if (max2_i >= 0) r[max2_i] = 15; + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysam_stderr); fputc('\n', pysam_stderr); + } + free(ref0); free(cns); + } + { // the length of the homopolymer run around the current position + int c = seq_nt16_table[(int)ref[pos + 1]]; + if (c == 15) l_run = 1; + else { + for (i = pos + 2; ref[i]; ++i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run = i; + for (i = pos; i >= 0; --i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run -= i + 1; + } + } + // construct the consensus sequence + max_ins = types[n_types - 1]; // max_ins is at least 0 + if (max_ins > 0) { + int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int)); + // count the number of occurrences of each base at each position for each type of insertion + for (t = 0; t < n_types; ++t) { + if (types[t] > 0) { + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + if (p->indel == types[t]) { + uint8_t *seq = bam_get_seq(p->b); + for (k = 1; k <= p->indel; ++k) { + int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; + } + } + } + } + } + } + // use the majority rule to construct the consensus + inscns = (char*) calloc(n_types * max_ins, 1); + for (t = 0; t < n_types; ++t) { + for (j = 0; j < types[t]; ++j) { + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) + if (ia[k] > max) + max = ia[k], max_k = k; + inscns[t*max_ins + j] = max? max_k : 4; + if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's + } + } + free(inscns_aux); + } + // compute the likelihood given each type of indel for each read + max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); + ref2 = (char*) calloc(max_ref2, 1); + query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); + score1 = (int*) calloc(N * n_types, sizeof(int)); + score2 = (int*) calloc(N * n_types, sizeof(int)); + bca->indelreg = 0; + for (t = 0; t < n_types; ++t) { + int l, ir; + probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; + apf1.bw = apf2.bw = abs(types[t]) + 3; + // compute indelreg + if (types[t] == 0) ir = 0; + else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); + else ir = est_indelreg(pos, ref, -types[t], 0); + if (ir > bca->indelreg) bca->indelreg = ir; +// fprintf(pysam_stderr, "%d, %d, %d\n", pos, types[t], ir); + // realignment + for (s = K = 0; s < n; ++s) { + // write ref2 + for (k = 0, j = left; j <= pos; ++j) + ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; + if (types[t] <= 0) j += -types[t]; + else for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + for (; j < right && ref[j]; ++j) + ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; + for (; k < max_ref2; ++k) ref2[k] = 4; + if (j < right) right = j; + // align each read to ref2 + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int qbeg, qend, tbeg, tend, sc, kk; + uint8_t *seq = bam_get_seq(p->b); + uint32_t *cigar = bam_get_cigar(p->b); + if (p->b->core.flag&4) continue; // unmapped reads + // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. + for (kk = 0; kk < p->b->core.n_cigar; ++kk) + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; + if (kk < p->b->core.n_cigar) continue; + // FIXME: the following skips soft clips, but using them may be more sensitive. + // determine the start and end of sequences for alignment + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); + if (types[t] < 0) { + int l = -types[t]; + tbeg = tbeg - l > left? tbeg - l : left; + } + // write the query sequence + for (l = qbeg; l < qend; ++l) + query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; + { // do realignment; this is the bottleneck + const uint8_t *qual = bam_get_qual(p->b), *bq; + uint8_t *qq; + qq = (uint8_t*) calloc(qend - qbeg, 1); + bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); + if (bq) ++bq; // skip type + for (l = qbeg; l < qend; ++l) { + qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; + if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; + if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; + } + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); + l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below + if (l > 255) l = 255; + score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; + if (sc > 5) { + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); + l = (int)(100. * sc / (qend - qbeg) + .499); + if (l > 255) l = 255; + score2[K*n_types + t] = sc<<8 | l; + } + free(qq); + } +/* + for (l = 0; l < tend - tbeg + abs(types[t]); ++l) + fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysam_stderr); + fputc('\n', pysam_stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysam_stderr); + fputc('\n', pysam_stderr); + fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); +*/ + } + } + } + free(ref2); free(query); + { // compute indelQ + int sc_a[16], sumq_a[16]; + int tmp, *sc = sc_a, *sumq = sumq_a; + if (n_types > 16) { + sc = (int *)malloc(n_types * sizeof(int)); + sumq = (int *)malloc(n_types * sizeof(int)); + } + memset(sumq, 0, n_types * sizeof(int)); + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. + */ + if ((sc[0]&0x3f) == ref_type) { + indelQ1 = (sc[1]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ1 = (sc[t]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); + } + tmp = sc[0]>>6 & 0xff; + indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ + sct = &score2[K*n_types]; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + if ((sc[0]&0x3f) == ref_type) { + indelQ2 = (sc[1]>>14) - (sc[0]>>14); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ2 = (sc[t]>>14) - (sc[0]>>14); + } + tmp = sc[0]>>6 & 0xff; + indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); + // pick the smaller between indelQ1 and indelQ2 + indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total + sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; +// fprintf(pysam_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + } + } + // determine bca->indel_types[] and bca->inscns + bca->maxins = max_ins; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) { + bca->indel_types[t] = types[sumq[t]&0x3f]; + memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); + } + // update p->aux + for (s = n_alt = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + int x = types[p->aux>>16&0x3f]; + for (j = 0; j < 4; ++j) + if (x == bca->indel_types[j]) break; + p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); + if ((p->aux>>16&0x3f) > 0) ++n_alt; + //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + } + } + + if (sc != sc_a) free(sc); + if (sumq != sumq_a) free(sumq); + } + free(score1); free(score2); + // free + for (i = 0; i < n; ++i) free(ref_sample[i]); + free(ref_sample); + free(types); free(inscns); + return n_alt > 0? 0 : -1; +} diff --git a/bcftools/bam_sample.c b/bcftools/bam_sample.c new file mode 100644 index 0000000..66f5729 --- /dev/null +++ b/bcftools/bam_sample.c @@ -0,0 +1,393 @@ +/* bam_sample.c -- group data by sample. + + Copyright (C) 2010, 2011 Broad Institute. + Copyright (C) 2013, 2016 Genome Research Ltd. + + Author: Heng Li , Petr Danecek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include "bam_sample.h" +#include "bcftools.h" + + +typedef struct +{ + char *fname; + void *rg2idx; // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup + int default_idx; // default BCF output sample index, set only when all readgroups are treated as one sample +} +file_t; + +struct _bam_smpl_t +{ + kstring_t tmp; + file_t *files; + int ignore_rg, nsmpl, nfiles; + char **smpl; // list of BCF output sample names. Maintained by bsmpl_add_readgroup + void *sample_list; // hash: BAM input sample name to BCF output sample name. This is the -s/-S list + int sample_logic; // the -s/-S logic, 1: include, 0: exclude + void *rg_list; // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list + int rg_logic; // the -G logic, 1: include, 0: exclude + void *name2idx; // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup +}; + +bam_smpl_t *bam_smpl_init(void) +{ + bam_smpl_t *bsmpl; + bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t)); + bsmpl->name2idx = khash_str2int_init(); + return bsmpl; +} + +void bam_smpl_destroy(bam_smpl_t *bsmpl) +{ + if ( !bsmpl ) return; + if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx); + if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list); + if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list); + int i; + for (i=0; infiles; i++) + { + file_t *file = &bsmpl->files[i]; + if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx); + free(file->fname); + } + free(bsmpl->smpl); + free(bsmpl->files); + free(bsmpl->tmp.s); + free(bsmpl); +} + +void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl) +{ + bsmpl->ignore_rg = 1; +} + +static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name) +{ + int ismpl = -1; + if ( smpl_name ) + { + if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 ) + { + // new sample + bsmpl->nsmpl++; + bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl); + bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name); + ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]); + } + } + if ( !strcmp("*",rg_id) ) + { + // all read groups in the bam treated as the same sample + file->default_idx = ismpl; + return; + } + if ( !file->rg2idx ) file->rg2idx = khash_str2int_init(); + if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return; // duplicate @RG:ID + khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl); +} +static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name) +{ + char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id); // unique read group present in one bam only + if ( !rg_smpl ) + { + // read group specific to this bam + bsmpl->tmp.l = 0; + ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname); + rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s); + } + if ( !rg_smpl ) + { + // any read group in this file? + bsmpl->tmp.l = 0; + ksprintf(&bsmpl->tmp,"*\t%s",file->fname); + rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s); + } + if ( !rg_smpl && bsmpl->rg_logic ) return 0; + if ( rg_smpl && !bsmpl->rg_logic ) return 0; + + if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl; // rename the sample + return 1; +} + +/* + The logic of this function is a bit complicated because we want to work + also with broken bams containing read groups that are not listed in the + header. The desired behavior is as follows: + - when -G is given, read groups which are not listed in the header must + be given explicitly using the "?" symbol in -G. + Otherwise: + - if the bam has no header, all reads in the file are assigned to a + single sample named after the file + - if there is at least one sample defined in the header, reads with no + read group id or with a read group id not listed in the header are + assigned to the first sample encountered in the header +*/ +int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname) +{ + bsmpl->nfiles++; + bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t)); + file_t *file = &bsmpl->files[bsmpl->nfiles-1]; + memset(file,0,sizeof(file_t)); + file->fname = strdup(fname); + file->default_idx = -1; + + if ( bsmpl->ignore_rg || !bam_hdr ) + { + // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name + bsmpl_add_readgroup(bsmpl,file,"*",file->fname); + return bsmpl->nfiles-1; + } + + void *bam_smpls = khash_str2int_init(); + int first_smpl = -1, nskipped = 0; + const char *p = bam_hdr, *q, *r; + while ((q = strstr(p, "@RG")) != 0) + { + p = q + 3; + r = q = 0; + if ((q = strstr(p, "\tID:")) != 0) q += 4; + if ((r = strstr(p, "\tSM:")) != 0) r += 4; + if (r && q) + { + char *u, *v; + int ioq, ior; + for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u); + for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v); + ioq = *u; ior = *v; *u = *v = '\0'; + + // q now points to a null terminated read group id + // r points to a null terminated sample name + if ( !strcmp("*",q) || !strcmp("?",q) ) + error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname); + + int accept_rg = 1; + if ( bsmpl->sample_list ) + { + // restrict samples based on the -s/-S options + char *name = khash_str2str_get(bsmpl->sample_list,r); + if ( bsmpl->sample_logic==0 ) + accept_rg = name ? 0 : 1; + else if ( !name ) + accept_rg = 0; + else + r = name; + } + if ( accept_rg && bsmpl->rg_list ) + { + // restrict readgroups based on the -G option, possibly renaming the sample + accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r); + } + if ( accept_rg ) + bsmpl_add_readgroup(bsmpl,file,q,r); + else + { + bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header + nskipped++; + } + + if ( first_smpl<0 ) + khash_str2int_get(bsmpl->name2idx,r,&first_smpl); + if ( !khash_str2int_has_key(bam_smpls,r) ) + khash_str2int_inc(bam_smpls,strdup(r)); + + *u = ioq; *v = ior; + } + else + break; + p = q > r ? q : r; + } + int nsmpls = khash_str2int_size(bam_smpls); + khash_str2int_destroy_free(bam_smpls); + + const char *smpl_name = NULL; + int accept_null_rg = 1; + if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0; + if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0; + + if ( !accept_null_rg && first_smpl==-1 ) + { + // no suitable read group is available in this bam: ignore the whole file. + free(file->fname); + bsmpl->nfiles--; + return -1; + } + if ( !accept_null_rg ) return bsmpl->nfiles-1; + if ( nsmpls==1 && !nskipped ) + { + file->default_idx = first_smpl; + return bsmpl->nfiles-1; + } + if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl]; + + bsmpl_add_readgroup(bsmpl,file,"?",smpl_name); + return bsmpl->nfiles-1; +} + +const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl) +{ + *nsmpl = bsmpl->nsmpl; + return (const char**)bsmpl->smpl; +} + +int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec) +{ + file_t *file = &bsmpl->files[bam_id]; + if ( file->default_idx >= 0 ) return file->default_idx; + + char *aux_rg = (char*) bam_aux_get(bam_rec, "RG"); + aux_rg = aux_rg ? aux_rg+1 : "?"; + + int rg_id; + if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id; + if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id; + return -1; +} + +int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file) +{ + if ( list[0]!='^' ) bsmpl->sample_logic = 1; + else list++; + + int i, nsamples = 0; + char **samples = hts_readlist(list, is_file, &nsamples); + if ( !nsamples ) return 0; + + kstring_t ori = {0,0,0}; + kstring_t ren = {0,0,0}; + + bsmpl->sample_list = khash_str2str_init(); + for (i=0; isample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s)); + free(samples[i]); + } + free(samples); + free(ori.s); + free(ren.s); + return nsamples; +} + +int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file) +{ + if ( list[0]!='^' ) bsmpl->rg_logic = 1; + else list++; + + int i, nrows = 0; + char **rows = hts_readlist(list, is_file, &nrows); + if ( !nrows ) return 0; + + kstring_t fld1 = {0,0,0}; + kstring_t fld2 = {0,0,0}; + kstring_t fld3 = {0,0,0}; + + bsmpl->rg_list = khash_str2str_init(); + for (i=0; irg_list,fld1.s); + if ( !value ) + khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t")); + else if ( strcmp(value,fld2.l?fld2.s:"\t") ) + error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t"); + free(rows[i]); + } + free(rows); + free(fld1.s); + free(fld2.s); + free(fld3.s); + return nrows; +} + + diff --git a/bcftools/bam_sample.c.pysam.c b/bcftools/bam_sample.c.pysam.c new file mode 100644 index 0000000..76d7a61 --- /dev/null +++ b/bcftools/bam_sample.c.pysam.c @@ -0,0 +1,395 @@ +#include "pysam.h" + +/* bam_sample.c -- group data by sample. + + Copyright (C) 2010, 2011 Broad Institute. + Copyright (C) 2013, 2016 Genome Research Ltd. + + Author: Heng Li , Petr Danecek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include "bam_sample.h" +#include "bcftools.h" + + +typedef struct +{ + char *fname; + void *rg2idx; // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup + int default_idx; // default BCF output sample index, set only when all readgroups are treated as one sample +} +file_t; + +struct _bam_smpl_t +{ + kstring_t tmp; + file_t *files; + int ignore_rg, nsmpl, nfiles; + char **smpl; // list of BCF output sample names. Maintained by bsmpl_add_readgroup + void *sample_list; // hash: BAM input sample name to BCF output sample name. This is the -s/-S list + int sample_logic; // the -s/-S logic, 1: include, 0: exclude + void *rg_list; // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list + int rg_logic; // the -G logic, 1: include, 0: exclude + void *name2idx; // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup +}; + +bam_smpl_t *bam_smpl_init(void) +{ + bam_smpl_t *bsmpl; + bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t)); + bsmpl->name2idx = khash_str2int_init(); + return bsmpl; +} + +void bam_smpl_destroy(bam_smpl_t *bsmpl) +{ + if ( !bsmpl ) return; + if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx); + if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list); + if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list); + int i; + for (i=0; infiles; i++) + { + file_t *file = &bsmpl->files[i]; + if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx); + free(file->fname); + } + free(bsmpl->smpl); + free(bsmpl->files); + free(bsmpl->tmp.s); + free(bsmpl); +} + +void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl) +{ + bsmpl->ignore_rg = 1; +} + +static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name) +{ + int ismpl = -1; + if ( smpl_name ) + { + if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 ) + { + // new sample + bsmpl->nsmpl++; + bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl); + bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name); + ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]); + } + } + if ( !strcmp("*",rg_id) ) + { + // all read groups in the bam treated as the same sample + file->default_idx = ismpl; + return; + } + if ( !file->rg2idx ) file->rg2idx = khash_str2int_init(); + if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return; // duplicate @RG:ID + khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl); +} +static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name) +{ + char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id); // unique read group present in one bam only + if ( !rg_smpl ) + { + // read group specific to this bam + bsmpl->tmp.l = 0; + ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname); + rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s); + } + if ( !rg_smpl ) + { + // any read group in this file? + bsmpl->tmp.l = 0; + ksprintf(&bsmpl->tmp,"*\t%s",file->fname); + rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s); + } + if ( !rg_smpl && bsmpl->rg_logic ) return 0; + if ( rg_smpl && !bsmpl->rg_logic ) return 0; + + if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl; // rename the sample + return 1; +} + +/* + The logic of this function is a bit complicated because we want to work + also with broken bams containing read groups that are not listed in the + header. The desired behavior is as follows: + - when -G is given, read groups which are not listed in the header must + be given explicitly using the "?" symbol in -G. + Otherwise: + - if the bam has no header, all reads in the file are assigned to a + single sample named after the file + - if there is at least one sample defined in the header, reads with no + read group id or with a read group id not listed in the header are + assigned to the first sample encountered in the header +*/ +int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname) +{ + bsmpl->nfiles++; + bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t)); + file_t *file = &bsmpl->files[bsmpl->nfiles-1]; + memset(file,0,sizeof(file_t)); + file->fname = strdup(fname); + file->default_idx = -1; + + if ( bsmpl->ignore_rg || !bam_hdr ) + { + // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name + bsmpl_add_readgroup(bsmpl,file,"*",file->fname); + return bsmpl->nfiles-1; + } + + void *bam_smpls = khash_str2int_init(); + int first_smpl = -1, nskipped = 0; + const char *p = bam_hdr, *q, *r; + while ((q = strstr(p, "@RG")) != 0) + { + p = q + 3; + r = q = 0; + if ((q = strstr(p, "\tID:")) != 0) q += 4; + if ((r = strstr(p, "\tSM:")) != 0) r += 4; + if (r && q) + { + char *u, *v; + int ioq, ior; + for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u); + for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v); + ioq = *u; ior = *v; *u = *v = '\0'; + + // q now points to a null terminated read group id + // r points to a null terminated sample name + if ( !strcmp("*",q) || !strcmp("?",q) ) + error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname); + + int accept_rg = 1; + if ( bsmpl->sample_list ) + { + // restrict samples based on the -s/-S options + char *name = khash_str2str_get(bsmpl->sample_list,r); + if ( bsmpl->sample_logic==0 ) + accept_rg = name ? 0 : 1; + else if ( !name ) + accept_rg = 0; + else + r = name; + } + if ( accept_rg && bsmpl->rg_list ) + { + // restrict readgroups based on the -G option, possibly renaming the sample + accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r); + } + if ( accept_rg ) + bsmpl_add_readgroup(bsmpl,file,q,r); + else + { + bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header + nskipped++; + } + + if ( first_smpl<0 ) + khash_str2int_get(bsmpl->name2idx,r,&first_smpl); + if ( !khash_str2int_has_key(bam_smpls,r) ) + khash_str2int_inc(bam_smpls,strdup(r)); + + *u = ioq; *v = ior; + } + else + break; + p = q > r ? q : r; + } + int nsmpls = khash_str2int_size(bam_smpls); + khash_str2int_destroy_free(bam_smpls); + + const char *smpl_name = NULL; + int accept_null_rg = 1; + if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0; + if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0; + + if ( !accept_null_rg && first_smpl==-1 ) + { + // no suitable read group is available in this bam: ignore the whole file. + free(file->fname); + bsmpl->nfiles--; + return -1; + } + if ( !accept_null_rg ) return bsmpl->nfiles-1; + if ( nsmpls==1 && !nskipped ) + { + file->default_idx = first_smpl; + return bsmpl->nfiles-1; + } + if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl]; + + bsmpl_add_readgroup(bsmpl,file,"?",smpl_name); + return bsmpl->nfiles-1; +} + +const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl) +{ + *nsmpl = bsmpl->nsmpl; + return (const char**)bsmpl->smpl; +} + +int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec) +{ + file_t *file = &bsmpl->files[bam_id]; + if ( file->default_idx >= 0 ) return file->default_idx; + + char *aux_rg = (char*) bam_aux_get(bam_rec, "RG"); + aux_rg = aux_rg ? aux_rg+1 : "?"; + + int rg_id; + if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id; + if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id; + return -1; +} + +int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file) +{ + if ( list[0]!='^' ) bsmpl->sample_logic = 1; + else list++; + + int i, nsamples = 0; + char **samples = hts_readlist(list, is_file, &nsamples); + if ( !nsamples ) return 0; + + kstring_t ori = {0,0,0}; + kstring_t ren = {0,0,0}; + + bsmpl->sample_list = khash_str2str_init(); + for (i=0; isample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s)); + free(samples[i]); + } + free(samples); + free(ori.s); + free(ren.s); + return nsamples; +} + +int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file) +{ + if ( list[0]!='^' ) bsmpl->rg_logic = 1; + else list++; + + int i, nrows = 0; + char **rows = hts_readlist(list, is_file, &nrows); + if ( !nrows ) return 0; + + kstring_t fld1 = {0,0,0}; + kstring_t fld2 = {0,0,0}; + kstring_t fld3 = {0,0,0}; + + bsmpl->rg_list = khash_str2str_init(); + for (i=0; irg_list,fld1.s); + if ( !value ) + khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t")); + else if ( strcmp(value,fld2.l?fld2.s:"\t") ) + error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t"); + free(rows[i]); + } + free(rows); + free(fld1.s); + free(fld2.s); + free(fld3.s); + return nrows; +} + + diff --git a/bcftools/bam_sample.h b/bcftools/bam_sample.h new file mode 100644 index 0000000..5cbcc39 --- /dev/null +++ b/bcftools/bam_sample.h @@ -0,0 +1,50 @@ +/* bam_sample.h -- group data by sample. + + Copyright (C) 2010 Broad Institute. + Copyright (C) 2016 Genome Research Ltd. + + Author: Heng Li , Petr Danecek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef BAM_SAMPLE_H +#define BAM_SAMPLE_H + +#include + +typedef struct _bam_smpl_t bam_smpl_t; + +bam_smpl_t *bam_smpl_init(void); + +int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file); +int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file); +void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl); + +// The above should be called only before bams are added. Returns the BAM id +// to be passed to bam_smpl_get_sample_id() later. It is safe to assume +// sequential numbering, starting from 0. +// +int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname); + +const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl); +int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec); + +void bam_smpl_destroy(bam_smpl_t *bsmpl); + +#endif diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index d4e856d..7d2d49f 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -30,6 +30,7 @@ THE SOFTWARE. */ #include #include +#define FT_TAB_TEXT 0 // custom tab-delimited text file #define FT_GZ 1 #define FT_VCF 2 #define FT_VCF_GZ (FT_GZ|FT_VCF) diff --git a/bcftools/bin.c b/bcftools/bin.c new file mode 100644 index 0000000..b558b20 --- /dev/null +++ b/bcftools/bin.c @@ -0,0 +1,104 @@ +/* The MIT License + + Copyright (c) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include "bcftools.h" +#include "bin.h" + +struct _bin_t +{ + float *bins; + int nbins; +}; + +bin_t *bin_init(const char *list_def, float min, float max) +{ + bin_t *bin = (bin_t*) calloc(1,sizeof(bin_t)); + + // a comma indicates a list, otherwise a file + int is_file = strchr(list_def,',') ? 0 : 1; + int i, nlist; + char **list = hts_readlist(list_def, is_file, &nlist); + bin->nbins = nlist; + bin->bins = (float*) malloc(sizeof(float)*nlist); + for (i=0; ibins[i] = strtod(list[i],&tmp); + if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]); + if ( min!=max && (bin->bins[i]bins[i]>max) ) + error("Expected values from the interval [%f,%f], found %s\n", list[i]); + free(list[i]); + } + free(list); + + if ( min!=max ) + { + // make sure we've got both boundaries: min,max. + assert( nlist>1 ); + float max_err = (bin->bins[1] - bin->bins[0])*1e-6; + if ( fabs(bin->bins[0] - min) > max_err ) + { + bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float)); + memmove(bin->bins+1, bin->bins, sizeof(float)*(bin->nbins-1)); + bin->bins[0] = min; + } + if ( fabs(bin->bins[bin->nbins-1] - max) > max_err ) + { + bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float)); + bin->bins[bin->nbins-1] = max; + } + } + return bin; +} + +void bin_destroy(bin_t *bin) +{ + free(bin->bins); + free(bin); +} + +int bin_get_size(bin_t *bin) { return bin->nbins; } + +float bin_get_value(bin_t *bin, int idx) { return bin->bins[idx]; } + +int bin_get_idx(bin_t *bin, float value) +{ + if ( bin->bins[bin->nbins-1] < value ) return bin->nbins-1; + + // Binary search in half-closed,half-open intervals [) + int imin = 0, imax = bin->nbins - 2; + while ( iminbins[i] ) imax = i - 1; + else if ( value > bin->bins[i] ) imin = i + 1; + else return i; + } + if ( bin->bins[imax] <= value ) return imax; + return imin - 1; +} + diff --git a/bcftools/bin.c.pysam.c b/bcftools/bin.c.pysam.c new file mode 100644 index 0000000..6469b57 --- /dev/null +++ b/bcftools/bin.c.pysam.c @@ -0,0 +1,106 @@ +#include "pysam.h" + +/* The MIT License + + Copyright (c) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include "bcftools.h" +#include "bin.h" + +struct _bin_t +{ + float *bins; + int nbins; +}; + +bin_t *bin_init(const char *list_def, float min, float max) +{ + bin_t *bin = (bin_t*) calloc(1,sizeof(bin_t)); + + // a comma indicates a list, otherwise a file + int is_file = strchr(list_def,',') ? 0 : 1; + int i, nlist; + char **list = hts_readlist(list_def, is_file, &nlist); + bin->nbins = nlist; + bin->bins = (float*) malloc(sizeof(float)*nlist); + for (i=0; ibins[i] = strtod(list[i],&tmp); + if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]); + if ( min!=max && (bin->bins[i]bins[i]>max) ) + error("Expected values from the interval [%f,%f], found %s\n", list[i]); + free(list[i]); + } + free(list); + + if ( min!=max ) + { + // make sure we've got both boundaries: min,max. + assert( nlist>1 ); + float max_err = (bin->bins[1] - bin->bins[0])*1e-6; + if ( fabs(bin->bins[0] - min) > max_err ) + { + bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float)); + memmove(bin->bins+1, bin->bins, sizeof(float)*(bin->nbins-1)); + bin->bins[0] = min; + } + if ( fabs(bin->bins[bin->nbins-1] - max) > max_err ) + { + bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float)); + bin->bins[bin->nbins-1] = max; + } + } + return bin; +} + +void bin_destroy(bin_t *bin) +{ + free(bin->bins); + free(bin); +} + +int bin_get_size(bin_t *bin) { return bin->nbins; } + +float bin_get_value(bin_t *bin, int idx) { return bin->bins[idx]; } + +int bin_get_idx(bin_t *bin, float value) +{ + if ( bin->bins[bin->nbins-1] < value ) return bin->nbins-1; + + // Binary search in half-closed,half-open intervals [) + int imin = 0, imax = bin->nbins - 2; + while ( iminbins[i] ) imax = i - 1; + else if ( value > bin->bins[i] ) imin = i + 1; + else return i; + } + if ( bin->bins[imax] <= value ) return imax; + return imin - 1; +} + diff --git a/bcftools/bin.h b/bcftools/bin.h new file mode 100644 index 0000000..ab9e5b1 --- /dev/null +++ b/bcftools/bin.h @@ -0,0 +1,65 @@ +/* The MIT License + + Copyright (c) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +/* + Simple binning of float values into predefined bins +*/ + +#ifndef __BIN_H__ +#define __BIN_H__ + +#include + +typedef struct _bin_t bin_t; + +/* + * bin_init() - init bins + * @list: list of half-open intervals [). If the list does not contain commas, + * it is interpreted as a file name. + * @min,max: extreme values. This is for user convenience so that well-known + * extremes can be left out from the list. Ignored if min=max + */ +bin_t *bin_init(const char *list, float min, float max); +void bin_destroy(bin_t *bin); + +/* + * bin_get_size() - number of boundaries, subtract 1 to get the number of bins + */ +int bin_get_size(bin_t *bin); + +/* + bin_get_idx() - find the bin index which corresponds to the value (binary search) + Returns the bin index 0 <= idx <= size-2 or -1,size-1 for out of range values. + */ +int bin_get_idx(bin_t *bin, float value); + +/* + bin_get_value() - get the i-th boundary value, i=0,..,size-1 + */ +float bin_get_value(bin_t *bin, int ith); + +#endif + diff --git a/bcftools/call.h b/bcftools/call.h index bbf0a52..0d707a0 100644 --- a/bcftools/call.h +++ b/bcftools/call.h @@ -72,6 +72,7 @@ typedef struct double trio_Pm_SNPs, trio_Pm_del, trio_Pm_ins; // P(mendelian) for trio calling, see mcall_call_trio_genotypes() int32_t *ugts, *cgts; // unconstraind and constrained GTs uint32_t output_tags; + char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN) // ccall only double indel_frac, min_perm_p, min_lrt; @@ -102,7 +103,7 @@ call_t; void error(const char *format, ...); /* - * *call() - return negative value on error or the number of non-reference + * call() - return -1 value on critical error; -2 to skip the site; or the number of non-reference * alleles on success. */ int mcall(call_t *call, bcf1_t *rec); // multiallic and rare-variant calling model diff --git a/bcftools/ccall.c b/bcftools/ccall.c index bb43d61..9f6958a 100644 --- a/bcftools/ccall.c +++ b/bcftools/ccall.c @@ -189,8 +189,6 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double bcf_update_info_string(call->hdr, rec, "CGT", tmp); } } - if (pr == 0) return 1; - is_var = (pr->p_ref < call->pref); r = is_var? pr->p_ref : pr->p_var; @@ -232,11 +230,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double // Remove unused alleles int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1; - if ( call->flag & CALL_KEEPALT && call->unseen>0 ) - { - assert( call->unseen==nals-1 ); - nals--; - } + if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--; if ( nalsn_allele ) { @@ -272,7 +266,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double int i; for (i=0; in_sample; i++) { - int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2; + int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2; int gt = x&3; if ( !call->ploidy || call->ploidy[i]==2 ) { diff --git a/bcftools/ccall.c.pysam.c b/bcftools/ccall.c.pysam.c index d4ceb01..1765d84 100644 --- a/bcftools/ccall.c.pysam.c +++ b/bcftools/ccall.c.pysam.c @@ -191,8 +191,6 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double bcf_update_info_string(call->hdr, rec, "CGT", tmp); } } - if (pr == 0) return 1; - is_var = (pr->p_ref < call->pref); r = is_var? pr->p_ref : pr->p_var; @@ -234,11 +232,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double // Remove unused alleles int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1; - if ( call->flag & CALL_KEEPALT && call->unseen>0 ) - { - assert( call->unseen==nals-1 ); - nals--; - } + if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--; if ( nalsn_allele ) { @@ -274,7 +268,7 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double int i; for (i=0; in_sample; i++) { - int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2; + int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2; int gt = x&3; if ( !call->ploidy || call->ploidy[i]==2 ) { diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 051f353..4fccc4f 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,7 @@ #include #include #include -#include +#include "regidx.h" #include "bcftools.h" #include "rbuf.h" @@ -68,6 +69,7 @@ typedef struct int nvcf_buf, rid; regidx_t *mask; + regitr_t *itr; int chain_id; // chain_id, to provide a unique ID to each chain in the chain output chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences @@ -202,6 +204,7 @@ static void init_data(args_t *args) { args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL); if ( !args->mask ) error("Failed to initialize mask regions\n"); + args->itr = regitr_init(args->mask); } // In case we want to store the chains if ( args->chain_fname ) @@ -228,6 +231,7 @@ static void destroy_data(args_t *args) free(args->vcf_buf); free(args->fa_buf.s); if ( args->mask ) regidx_destroy(args->mask); + if ( args->itr ) regitr_destroy(args->itr); if ( args->chain_fname ) if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname); if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname); @@ -409,12 +413,27 @@ static void apply_variant(args_t *args, bcf1_t *rec) rec->d.allele[1][0] = gt2iupac(ial,jal); } + int len_diff = 0, alen = 0; int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; - if ( idx<0 || idx>=args->fa_buf.l ) + if ( idx<0 ) + { + fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + return; + } + if ( rec->rlen > args->fa_buf.l - idx ) + { + rec->rlen = args->fa_buf.l - idx; + alen = strlen(rec->d.allele[ialt]); + if ( alen > rec->rlen ) + { + rec->d.allele[ialt][rec->rlen] = 0; + fprintf(stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + } + } + if ( idx>=args->fa_buf.l ) error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off); // sanity check the reference base - int len_diff = 0, alen = 0; if ( rec->d.allele[ialt][0]=='<' ) { if ( strcasecmp(rec->d.allele[ialt], "") ) @@ -495,18 +514,16 @@ static void mask_region(args_t *args, char *seq, int len) int start = args->fa_src_pos - len; int end = args->fa_src_pos; - regitr_t itr; - if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return; + if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return; int idx_start, idx_end, i; - while ( REGITR_OVERLAP(itr,start,end) ) + while ( regitr_overlap(args->itr) ) { - idx_start = REGITR_START(itr) - start; - idx_end = REGITR_END(itr) - start; + idx_start = args->itr->beg - start; + idx_end = args->itr->end - start; if ( idx_start < 0 ) idx_start = 0; if ( idx_end >= len ) idx_end = len - 1; for (i=idx_start; i<=idx_end; i++) seq[i] = 'N'; - itr.i++; } } @@ -519,7 +536,7 @@ static void consensus(args_t *args) { if ( str.s[0]=='>' ) { - // new sequence encountered, apply all chached variants + // new sequence encountered, apply all cached variants while ( args->vcf_rbuf.n ) { if (args->chain) { @@ -576,7 +593,17 @@ static void consensus(args_t *args) } if ( !rec_ptr ) flush_fa_buffer(args, 60); } - if (args->chain) { + bcf1_t **rec_ptr = NULL; + while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) + { + bcf1_t *rec = *rec_ptr; + if ( rec->rid!=args->rid ) break; + if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break; + if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break; + apply_variant(args, rec); + } + if (args->chain) + { print_chain(args); destroy_chain(args); } @@ -588,8 +615,11 @@ static void consensus(args_t *args) static void usage(args_t *args) { fprintf(stderr, "\n"); - fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference\n"); - fprintf(stderr, " fasta file.\n"); + fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n"); + fprintf(stderr, " file. By default, the program will apply all ALT variants. Using the\n"); + fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); + fprintf(stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n"); + fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n"); fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 91aa5ae..51d9339 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,7 @@ #include #include #include -#include +#include "regidx.h" #include "bcftools.h" #include "rbuf.h" @@ -70,6 +71,7 @@ typedef struct int nvcf_buf, rid; regidx_t *mask; + regitr_t *itr; int chain_id; // chain_id, to provide a unique ID to each chain in the chain output chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences @@ -204,6 +206,7 @@ static void init_data(args_t *args) { args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL); if ( !args->mask ) error("Failed to initialize mask regions\n"); + args->itr = regitr_init(args->mask); } // In case we want to store the chains if ( args->chain_fname ) @@ -230,6 +233,7 @@ static void destroy_data(args_t *args) free(args->vcf_buf); free(args->fa_buf.s); if ( args->mask ) regidx_destroy(args->mask); + if ( args->itr ) regitr_destroy(args->itr); if ( args->chain_fname ) if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname); if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname); @@ -411,12 +415,27 @@ static void apply_variant(args_t *args, bcf1_t *rec) rec->d.allele[1][0] = gt2iupac(ial,jal); } + int len_diff = 0, alen = 0; int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; - if ( idx<0 || idx>=args->fa_buf.l ) + if ( idx<0 ) + { + fprintf(pysam_stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + return; + } + if ( rec->rlen > args->fa_buf.l - idx ) + { + rec->rlen = args->fa_buf.l - idx; + alen = strlen(rec->d.allele[ialt]); + if ( alen > rec->rlen ) + { + rec->d.allele[ialt][rec->rlen] = 0; + fprintf(pysam_stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + } + } + if ( idx>=args->fa_buf.l ) error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off); // sanity check the reference base - int len_diff = 0, alen = 0; if ( rec->d.allele[ialt][0]=='<' ) { if ( strcasecmp(rec->d.allele[ialt], "") ) @@ -497,18 +516,16 @@ static void mask_region(args_t *args, char *seq, int len) int start = args->fa_src_pos - len; int end = args->fa_src_pos; - regitr_t itr; - if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return; + if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return; int idx_start, idx_end, i; - while ( REGITR_OVERLAP(itr,start,end) ) + while ( regitr_overlap(args->itr) ) { - idx_start = REGITR_START(itr) - start; - idx_end = REGITR_END(itr) - start; + idx_start = args->itr->beg - start; + idx_end = args->itr->end - start; if ( idx_start < 0 ) idx_start = 0; if ( idx_end >= len ) idx_end = len - 1; for (i=idx_start; i<=idx_end; i++) seq[i] = 'N'; - itr.i++; } } @@ -521,7 +538,7 @@ static void consensus(args_t *args) { if ( str.s[0]=='>' ) { - // new sequence encountered, apply all chached variants + // new sequence encountered, apply all cached variants while ( args->vcf_rbuf.n ) { if (args->chain) { @@ -578,7 +595,17 @@ static void consensus(args_t *args) } if ( !rec_ptr ) flush_fa_buffer(args, 60); } - if (args->chain) { + bcf1_t **rec_ptr = NULL; + while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) + { + bcf1_t *rec = *rec_ptr; + if ( rec->rid!=args->rid ) break; + if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break; + if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break; + apply_variant(args, rec); + } + if (args->chain) + { print_chain(args); destroy_chain(args); } @@ -590,8 +617,11 @@ static void consensus(args_t *args) static void usage(args_t *args) { fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference\n"); - fprintf(pysam_stderr, " fasta file.\n"); + fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n"); + fprintf(pysam_stderr, " file. By default, the program will apply all ALT variants. Using the\n"); + fprintf(pysam_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); + fprintf(pysam_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n"); + fprintf(pysam_stderr, " information, such as INFO/AD or FORMAT/AD.\n"); fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(pysam_stderr, "Options:\n"); fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); diff --git a/bcftools/convert.c b/bcftools/convert.c index 3e289f0..05dce01 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -62,13 +62,19 @@ THE SOFTWARE. */ #define T_IUPAC_GT 23 #define T_GT_TO_HAP 24 // not publicly advertised #define T_GT_TO_HAP2 25 // not publicly advertised +#define T_TBCSQ 26 +#define T_END 27 +#define T_POS0 28 +#define T_END0 29 typedef struct _fmt_t { int type, id, is_gt_field, ready, subscript; char *key; bcf_fmt_t *fmt; + void *usr; // user data (optional) void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *); + void (*destroy)(void*); // clean user data (optional) } fmt_t; @@ -88,9 +94,19 @@ struct _convert_t int allow_undef_tags; }; +typedef struct +{ + kstring_t hap1,hap2; + char **str; + int n, m; +} +bcsq_t; static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); } static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); } +static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); } +static void process_end(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen, str); } +static void process_end0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen-1, str); } static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); } static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); } static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) @@ -125,7 +141,7 @@ static void process_first_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { if ( bcf_float_is_missing(line->qual) ) kputc('.', str); - else ksprintf(str, "%g", line->qual); + else kputd(line->qual, str); } static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -193,7 +209,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break; case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break; case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break; - case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break; + case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break; case BCF_BT_CHAR: kputc(info->v1.i, str); break; default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break; } @@ -215,7 +231,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break; case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break; case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; - case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break; default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH @@ -226,6 +242,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt) { fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,fmt->id) ) fmt->id = -1; fmt->fmt = NULL; if ( fmt->id >= 0 ) { @@ -261,7 +278,7 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) ) kputc('.', str); else - ksprintf(str, "%g", ptr[fmt->subscript]); + kputd(ptr[fmt->subscript], str); } else if ( fmt->fmt->type != BCF_BT_CHAR ) { @@ -316,6 +333,111 @@ static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isampl } if (l == 0) kputc('.', str); } +static void destroy_tbcsq(void *usr) +{ + if ( !usr ) return; + bcsq_t *csq = (bcsq_t*) usr; + free(csq->hap1.s); + free(csq->hap2.s); + if ( csq->n ) + free(csq->str[0]); + free(csq->str); + free(csq); +} +static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +{ + if ( !fmt->ready ) + { + init_format(convert, line, fmt); + + bcsq_t *csq; + if ( fmt->usr ) + { + csq = (bcsq_t*) fmt->usr; + if ( csq->n ) + free(csq->str[0]); + csq->n = 0; + } + else + csq = (bcsq_t*) calloc(1,sizeof(bcsq_t)); + fmt->usr = csq; + + int i=0, len = 0; + char *tmp = NULL; + if ( bcf_get_info_string(convert->header,line,fmt->key,&tmp,&len)<0 ) + { + csq->n = 0; + return; + } + do + { + csq->n++; + hts_expand(char*, csq->n, csq->m, csq->str); + csq->str[ csq->n-1 ] = tmp + i; + while ( iusr; + + if ( fmt->fmt==NULL || !csq->n ) return; + + csq->hap1.l = 0; + csq->hap2.l = 0; + + int mask = fmt->subscript==0 ? 3 : 1; // merge both haplotypes if subscript==0 + + #define BRANCH(type_t, nbits) { \ + type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \ + int i,j; \ + if ( fmt->subscript<=0 || fmt->subscript==1 ) \ + { \ + for (j=0; j < fmt->fmt->n; j++) \ + { \ + type_t val = x[j]; \ + if ( !val ) continue; \ + for (i=0; istr[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ + } \ + } \ + if ( fmt->subscript<0 || fmt->subscript==2 ) \ + { \ + for (j=0; j < fmt->fmt->n; j++) \ + { \ + type_t val = x[j]; \ + if ( !val ) continue; \ + for (i=1; istr[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ + } \ + } \ + } + switch (fmt->fmt->type) + { + case BCF_BT_INT8: BRANCH(uint8_t, 8); break; + case BCF_BT_INT16: BRANCH(uint16_t,16); break; + case BCF_BT_INT32: BRANCH(uint32_t,32); break; + default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break; + } + #undef BRANCH + + if ( !csq->hap1.l && !csq->hap2.l ) return; + + if ( csq->hap1.l ) csq->hap1.s[--csq->hap1.l] = 0; + if ( csq->hap2.l ) csq->hap2.s[--csq->hap2.l] = 0; + + if ( fmt->subscript<0 ) + { + kputs(csq->hap1.l?csq->hap1.s:".", str); + kputc_('\t', str); + kputs(csq->hap2.l?csq->hap2.s:".", str); + } + else if ( fmt->subscript<2 ) + kputs(csq->hap1.l?csq->hap1.s:".", str); + else + kputs(csq->hap2.l?csq->hap2.s:".", str); +} static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt) { init_format(convert, line, fmt); @@ -409,6 +531,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; } if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; } if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; } + if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; } } static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -597,103 +720,260 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int // the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a // heterozygous genotype of unknown phase. - int m, n, i; - - m = convert->ndat / sizeof(int32_t); - n = bcf_get_genotypes(convert->header, line, &convert->dat, &m); - convert->ndat = m * sizeof(int32_t); - - if ( n<=0 ) - { - // Throw an error or silently proceed? - // - // for (i=0; insamples; i++) kputs(" ...", str); - // return; - - error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); - } - - n /= convert->nsamples; + int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + bcf_fmt_t *fmt_gt = NULL; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } + if ( !fmt_gt ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + + // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 + if ( line->n_allele > 100 ) + error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) + error("Could not alloc %d bytes\n", str->l + convert->nsamples*8); + + if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid + error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + + int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; for (i=0; insamples; i++) { - int32_t *ptr = (int32_t*)convert->dat + i*n; - int j; - for (j=0; j0) kputs(" ", str); // no space separation for first column - if ( j==2 ) + ptr += fmt_gt->n; + if ( ptr[0]==2 ) { - // diploid - if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) { - kputs("? ?", str); + if ( ptr[1]==3 ) /* 0|0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 0|1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 0/0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 0/1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; } - else if ( bcf_gt_is_phased(ptr[1])) { - ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; } - else { - ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else /* 0/x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; } } - else if ( j==1 ) + else if ( ptr[0]==4 ) { - // haploid - if ( bcf_gt_is_missing(ptr[0]) ) - kputs("? -", str); - else if ( bcf_gt_allele(ptr[0])==1 ) - kputs("1 -", str); // first ALT allele - else - kputs("0 -", str); // REF or something else than first ALT + if ( ptr[1]==3 ) /* 1|0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 1|1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 1/0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 1/1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; + } + else /* 1/x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + } + else if ( bcf_gt_is_missing(ptr[0]) ) + { + if ( ptr[1]==bcf_int8_vector_end ) + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + } + else if ( ptr[1]==bcf_int8_vector_end ) + { + /* use REF for something else than first ALT */ + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else + { + kputw(bcf_gt_allele(ptr[0]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; } - else error("FIXME: not ready for ploidy %d\n", j); } + str->s[--str->l] = 0; // delete the last space } static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { // same as process_gt_to_hap but converts haploid genotypes into diploid - int m, n, i; - - m = convert->ndat / sizeof(int32_t); - n = bcf_get_genotypes(convert->header, line, &convert->dat, &m); - convert->ndat = m * sizeof(int32_t); - - if ( n<=0 ) - error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); - n /= convert->nsamples; + int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + bcf_fmt_t *fmt_gt = NULL; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } + if ( !fmt_gt ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + + // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 + if ( line->n_allele > 100 ) + error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) + error("Could not alloc %d bytes\n", str->l + convert->nsamples*8); + + if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid + error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + + int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; for (i=0; insamples; i++) { - int32_t *ptr = (int32_t*)convert->dat + i*n; - int j; - for (j=0; j0) kputs(" ", str); // no space separation for first column - if ( j==2 ) + ptr += fmt_gt->n; + if ( ptr[0]==2 ) { - // diploid - if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) { - kputs("? ?", str); + if ( ptr[1]==3 ) /* 0|0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 0|1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 0 -> 0|0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 0/0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 0/1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; } - else if ( bcf_gt_is_phased(ptr[1])) { - ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; } - else { - ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; + } + else /* 0/x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; } } - else if ( j==1 ) + else if ( ptr[0]==4 ) { - // haploid - if ( bcf_gt_is_missing(ptr[0]) ) - kputs("? ?", str); - else if ( bcf_gt_allele(ptr[0])==1 ) - kputs("1 1", str); // first ALT allele - else - kputs("0 0", str); // REF or something else than first ALT + if ( ptr[1]==3 ) /* 1|0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 1|1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 1 -> 1|1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 1/0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 1/1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; + } + else /* 1/x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + } + else if ( bcf_gt_is_missing(ptr[0]) ) + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) + { + /* use REF for something else than first ALT */ + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else + { + kputw(bcf_gt_allele(ptr[0]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; } - else error("FIXME: not ready for ploidy %d\n", j); } + str->s[--str->l] = 0; // delete the last space } static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) @@ -709,6 +989,8 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) fmt->key = key ? strdup(key) : NULL; fmt->is_gt_field = is_gtf; fmt->subscript = -1; + fmt->usr = NULL; + fmt->destroy = NULL; // Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags. if ( key ) @@ -718,6 +1000,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) { if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; } else if ( !strcmp("POS",key) ) { fmt->type = T_POS; } + else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; } + else if ( !strcmp("END",key) ) { fmt->type = T_END; } + else if ( !strcmp("END0",key) ) { fmt->type = T_END0; } else if ( !strcmp("ID",key) ) { fmt->type = T_ID; } else if ( !strcmp("REF",key) ) { fmt->type = T_REF; } else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } @@ -742,6 +1027,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break; case T_CHROM: fmt->handler = &process_chrom; break; case T_POS: fmt->handler = &process_pos; break; + case T_POS0: fmt->handler = &process_pos0; break; + case T_END: fmt->handler = &process_end; break; + case T_END0: fmt->handler = &process_end0; break; case T_ID: fmt->handler = &process_id; break; case T_REF: fmt->handler = &process_ref; break; case T_ALT: fmt->handler = &process_alt; break; @@ -759,15 +1047,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break; case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break; case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break; - case T_LINE: fmt->handler = &process_line; break; + case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break; + case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break; default: error("TODO: handler for type %d\n", fmt->type); } - if ( key ) + if ( key && fmt->type==T_INFO ) { - if ( fmt->type==T_INFO ) + fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,fmt->id) ) { - fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key); - if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key); + fmt->id = -1; + convert->undef_info_tag = strdup(key); } } return fmt; @@ -797,6 +1087,16 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf); else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf); else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf); + else if ( !strcmp(str.s, "TBCSQ") ) + { + fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf); + fmt->subscript = parse_subscript(&q); + if ( fmt->subscript==-1 ) + { + if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; } + } + else fmt->subscript++; + } else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf); else if ( !strcmp(str.s, "INFO") ) { @@ -819,6 +1119,9 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) { if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf); else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf); + else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf); + else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf); + else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf); else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf); else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf); else if ( !strcmp(str.s, "ALT") ) @@ -903,6 +1206,8 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * default: p = parse_sep(convert, p, is_gtf); break; } } + if ( is_gtf ) + error("Could not parse the format string, missing the square bracket \"]\": %s\n", convert->format_str); if ( nsamples ) { @@ -923,7 +1228,10 @@ void convert_destroy(convert_t *convert) { int i; for (i=0; infmt; i++) + { + if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr); free(convert->fmt[i].key); + } free(convert->fmt); free(convert->undef_info_tag); free(convert->dat); @@ -984,7 +1292,7 @@ int convert_header(convert_t *convert, kstring_t *str) int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) { if ( !convert->allow_undef_tags && convert->undef_info_tag ) - error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag); + error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag); int l_ori = str->l; bcf_unpack(line, convert->max_unpack); @@ -993,17 +1301,24 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) str->l = 0; for (i=0; infmt; i++) { - // Genotype fields + // Genotype fields. if ( convert->fmt[i].is_gt_field ) { int j = i, js, k; - while ( convert->fmt[j].is_gt_field ) + while ( jnfmt && convert->fmt[j].is_gt_field ) { convert->fmt[j].ready = 0; j++; } for (js=0; jsnsamples; js++) { + // Here comes a hack designed for TBCSQ. When running on large files, + // such as 1000GP, there are too many empty fields in the output and + // it's very very slow. Therefore in case the handler does not add + // anything to the string, we trim all genotype fields enclosed in square + // brackets here. This may be changed in future, time will show... + size_t l_start = str->l; + int ks = convert->samples[js]; for (k=i; kreaders,ir)?'1':'0', str); } else if ( convert->fmt[k].handler ) + { + size_t l = str->l; convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str); + if ( l==str->l ) { str->l = l_start; break; } // only TBCSQ does this + } } } i = j-1; @@ -1027,6 +1346,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) } else if ( convert->fmt[i].handler ) convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str); + } return str->l - l_ori; } diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index 084ef50..95814b7 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -2,7 +2,7 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -64,13 +64,19 @@ THE SOFTWARE. */ #define T_IUPAC_GT 23 #define T_GT_TO_HAP 24 // not publicly advertised #define T_GT_TO_HAP2 25 // not publicly advertised +#define T_TBCSQ 26 +#define T_END 27 +#define T_POS0 28 +#define T_END0 29 typedef struct _fmt_t { int type, id, is_gt_field, ready, subscript; char *key; bcf_fmt_t *fmt; + void *usr; // user data (optional) void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *); + void (*destroy)(void*); // clean user data (optional) } fmt_t; @@ -90,9 +96,19 @@ struct _convert_t int allow_undef_tags; }; +typedef struct +{ + kstring_t hap1,hap2; + char **str; + int n, m; +} +bcsq_t; static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); } static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); } +static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); } +static void process_end(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen, str); } +static void process_end0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen-1, str); } static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); } static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); } static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) @@ -127,7 +143,7 @@ static void process_first_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { if ( bcf_float_is_missing(line->qual) ) kputc('.', str); - else ksprintf(str, "%g", line->qual); + else kputd(line->qual, str); } static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -195,7 +211,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break; case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break; case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break; - case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break; + case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break; case BCF_BT_CHAR: kputc(info->v1.i, str); break; default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; } @@ -217,7 +233,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break; case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break; case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; - case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break; default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH @@ -228,6 +244,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt) { fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,fmt->id) ) fmt->id = -1; fmt->fmt = NULL; if ( fmt->id >= 0 ) { @@ -263,7 +280,7 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) ) kputc('.', str); else - ksprintf(str, "%g", ptr[fmt->subscript]); + kputd(ptr[fmt->subscript], str); } else if ( fmt->fmt->type != BCF_BT_CHAR ) { @@ -318,6 +335,111 @@ static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isampl } if (l == 0) kputc('.', str); } +static void destroy_tbcsq(void *usr) +{ + if ( !usr ) return; + bcsq_t *csq = (bcsq_t*) usr; + free(csq->hap1.s); + free(csq->hap2.s); + if ( csq->n ) + free(csq->str[0]); + free(csq->str); + free(csq); +} +static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +{ + if ( !fmt->ready ) + { + init_format(convert, line, fmt); + + bcsq_t *csq; + if ( fmt->usr ) + { + csq = (bcsq_t*) fmt->usr; + if ( csq->n ) + free(csq->str[0]); + csq->n = 0; + } + else + csq = (bcsq_t*) calloc(1,sizeof(bcsq_t)); + fmt->usr = csq; + + int i=0, len = 0; + char *tmp = NULL; + if ( bcf_get_info_string(convert->header,line,fmt->key,&tmp,&len)<0 ) + { + csq->n = 0; + return; + } + do + { + csq->n++; + hts_expand(char*, csq->n, csq->m, csq->str); + csq->str[ csq->n-1 ] = tmp + i; + while ( iusr; + + if ( fmt->fmt==NULL || !csq->n ) return; + + csq->hap1.l = 0; + csq->hap2.l = 0; + + int mask = fmt->subscript==0 ? 3 : 1; // merge both haplotypes if subscript==0 + + #define BRANCH(type_t, nbits) { \ + type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \ + int i,j; \ + if ( fmt->subscript<=0 || fmt->subscript==1 ) \ + { \ + for (j=0; j < fmt->fmt->n; j++) \ + { \ + type_t val = x[j]; \ + if ( !val ) continue; \ + for (i=0; istr[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ + } \ + } \ + if ( fmt->subscript<0 || fmt->subscript==2 ) \ + { \ + for (j=0; j < fmt->fmt->n; j++) \ + { \ + type_t val = x[j]; \ + if ( !val ) continue; \ + for (i=1; istr[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ + } \ + } \ + } + switch (fmt->fmt->type) + { + case BCF_BT_INT8: BRANCH(uint8_t, 8); break; + case BCF_BT_INT16: BRANCH(uint16_t,16); break; + case BCF_BT_INT32: BRANCH(uint32_t,32); break; + default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break; + } + #undef BRANCH + + if ( !csq->hap1.l && !csq->hap2.l ) return; + + if ( csq->hap1.l ) csq->hap1.s[--csq->hap1.l] = 0; + if ( csq->hap2.l ) csq->hap2.s[--csq->hap2.l] = 0; + + if ( fmt->subscript<0 ) + { + kputs(csq->hap1.l?csq->hap1.s:".", str); + kputc_('\t', str); + kputs(csq->hap2.l?csq->hap2.s:".", str); + } + else if ( fmt->subscript<2 ) + kputs(csq->hap1.l?csq->hap1.s:".", str); + else + kputs(csq->hap2.l?csq->hap2.s:".", str); +} static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt) { init_format(convert, line, fmt); @@ -411,6 +533,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; } if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; } if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; } + if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; } } static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -599,103 +722,260 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int // the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a // heterozygous genotype of unknown phase. - int m, n, i; - - m = convert->ndat / sizeof(int32_t); - n = bcf_get_genotypes(convert->header, line, &convert->dat, &m); - convert->ndat = m * sizeof(int32_t); - - if ( n<=0 ) - { - // Throw an error or silently proceed? - // - // for (i=0; insamples; i++) kputs(" ...", str); - // return; - - error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); - } - - n /= convert->nsamples; + int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + bcf_fmt_t *fmt_gt = NULL; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } + if ( !fmt_gt ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + + // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 + if ( line->n_allele > 100 ) + error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) + error("Could not alloc %d bytes\n", str->l + convert->nsamples*8); + + if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid + error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + + int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; for (i=0; insamples; i++) { - int32_t *ptr = (int32_t*)convert->dat + i*n; - int j; - for (j=0; j0) kputs(" ", str); // no space separation for first column - if ( j==2 ) + ptr += fmt_gt->n; + if ( ptr[0]==2 ) { - // diploid - if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) { - kputs("? ?", str); + if ( ptr[1]==3 ) /* 0|0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 0|1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 0/0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 0/1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; } - else if ( bcf_gt_is_phased(ptr[1])) { - ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; } - else { - ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else /* 0/x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; } } - else if ( j==1 ) + else if ( ptr[0]==4 ) { - // haploid - if ( bcf_gt_is_missing(ptr[0]) ) - kputs("? -", str); - else if ( bcf_gt_allele(ptr[0])==1 ) - kputs("1 -", str); // first ALT allele - else - kputs("0 -", str); // REF or something else than first ALT + if ( ptr[1]==3 ) /* 1|0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 1|1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 1/0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 1/1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; + } + else /* 1/x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + } + else if ( bcf_gt_is_missing(ptr[0]) ) + { + if ( ptr[1]==bcf_int8_vector_end ) + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + } + else if ( ptr[1]==bcf_int8_vector_end ) + { + /* use REF for something else than first ALT */ + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; + } + else + { + kputw(bcf_gt_allele(ptr[0]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; } - else error("FIXME: not ready for ploidy %d\n", j); } + str->s[--str->l] = 0; // delete the last space } static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { // same as process_gt_to_hap but converts haploid genotypes into diploid - int m, n, i; - - m = convert->ndat / sizeof(int32_t); - n = bcf_get_genotypes(convert->header, line, &convert->dat, &m); - convert->ndat = m * sizeof(int32_t); - - if ( n<=0 ) - error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); - n /= convert->nsamples; + int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + bcf_fmt_t *fmt_gt = NULL; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } + if ( !fmt_gt ) + error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); + + // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 + if ( line->n_allele > 100 ) + error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) + error("Could not alloc %d bytes\n", str->l + convert->nsamples*8); + + if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid + error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); + + int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; for (i=0; insamples; i++) { - int32_t *ptr = (int32_t*)convert->dat + i*n; - int j; - for (j=0; j0) kputs(" ", str); // no space separation for first column - if ( j==2 ) + ptr += fmt_gt->n; + if ( ptr[0]==2 ) { - // diploid - if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) { - kputs("? ?", str); + if ( ptr[1]==3 ) /* 0|0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 0|1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 0 -> 0|0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 0/0 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 0/1 */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; } - else if ( bcf_gt_is_phased(ptr[1])) { - ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; } - else { - ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1])); + else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; + } + else /* 0/x */ + { + str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; } } - else if ( j==1 ) + else if ( ptr[0]==4 ) { - // haploid - if ( bcf_gt_is_missing(ptr[0]) ) - kputs("? ?", str); - else if ( bcf_gt_allele(ptr[0])==1 ) - kputs("1 1", str); // first ALT allele - else - kputs("0 0", str); // REF or something else than first ALT + if ( ptr[1]==3 ) /* 1|0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==5 ) /* 1|1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) /* 1 -> 1|1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==2 ) /* 1/0 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==4 ) /* 1/1 */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */ + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = ' '; + } + else /* 1/x */ + { + str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + str->s[str->l++] = '*'; str->s[str->l++] = ' '; + } + } + else if ( bcf_gt_is_missing(ptr[0]) ) + { + str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; + } + else if ( ptr[1]==bcf_int8_vector_end ) + { + /* use REF for something else than first ALT */ + str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' '; + } + else + { + kputw(bcf_gt_allele(ptr[0]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; + kputw(bcf_gt_allele(ptr[1]),str); + if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*'; + str->s[str->l++] = ' '; } - else error("FIXME: not ready for ploidy %d\n", j); } + str->s[--str->l] = 0; // delete the last space } static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) @@ -711,6 +991,8 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) fmt->key = key ? strdup(key) : NULL; fmt->is_gt_field = is_gtf; fmt->subscript = -1; + fmt->usr = NULL; + fmt->destroy = NULL; // Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags. if ( key ) @@ -720,6 +1002,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) { if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; } else if ( !strcmp("POS",key) ) { fmt->type = T_POS; } + else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; } + else if ( !strcmp("END",key) ) { fmt->type = T_END; } + else if ( !strcmp("END0",key) ) { fmt->type = T_END0; } else if ( !strcmp("ID",key) ) { fmt->type = T_ID; } else if ( !strcmp("REF",key) ) { fmt->type = T_REF; } else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } @@ -744,6 +1029,9 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break; case T_CHROM: fmt->handler = &process_chrom; break; case T_POS: fmt->handler = &process_pos; break; + case T_POS0: fmt->handler = &process_pos0; break; + case T_END: fmt->handler = &process_end; break; + case T_END0: fmt->handler = &process_end0; break; case T_ID: fmt->handler = &process_id; break; case T_REF: fmt->handler = &process_ref; break; case T_ALT: fmt->handler = &process_alt; break; @@ -761,15 +1049,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break; case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break; case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break; - case T_LINE: fmt->handler = &process_line; break; + case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break; + case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break; default: error("TODO: handler for type %d\n", fmt->type); } - if ( key ) + if ( key && fmt->type==T_INFO ) { - if ( fmt->type==T_INFO ) + fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key); + if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,fmt->id) ) { - fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key); - if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key); + fmt->id = -1; + convert->undef_info_tag = strdup(key); } } return fmt; @@ -799,6 +1089,16 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf); else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf); else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf); + else if ( !strcmp(str.s, "TBCSQ") ) + { + fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf); + fmt->subscript = parse_subscript(&q); + if ( fmt->subscript==-1 ) + { + if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; } + } + else fmt->subscript++; + } else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf); else if ( !strcmp(str.s, "INFO") ) { @@ -821,6 +1121,9 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) { if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf); else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf); + else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf); + else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf); + else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf); else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf); else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf); else if ( !strcmp(str.s, "ALT") ) @@ -905,6 +1208,8 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * default: p = parse_sep(convert, p, is_gtf); break; } } + if ( is_gtf ) + error("Could not parse the format string, missing the square bracket \"]\": %s\n", convert->format_str); if ( nsamples ) { @@ -925,7 +1230,10 @@ void convert_destroy(convert_t *convert) { int i; for (i=0; infmt; i++) + { + if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr); free(convert->fmt[i].key); + } free(convert->fmt); free(convert->undef_info_tag); free(convert->dat); @@ -986,7 +1294,7 @@ int convert_header(convert_t *convert, kstring_t *str) int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) { if ( !convert->allow_undef_tags && convert->undef_info_tag ) - error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag); + error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag); int l_ori = str->l; bcf_unpack(line, convert->max_unpack); @@ -995,17 +1303,24 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) str->l = 0; for (i=0; infmt; i++) { - // Genotype fields + // Genotype fields. if ( convert->fmt[i].is_gt_field ) { int j = i, js, k; - while ( convert->fmt[j].is_gt_field ) + while ( jnfmt && convert->fmt[j].is_gt_field ) { convert->fmt[j].ready = 0; j++; } for (js=0; jsnsamples; js++) { + // Here comes a hack designed for TBCSQ. When running on large files, + // such as 1000GP, there are too many empty fields in the output and + // it's very very slow. Therefore in case the handler does not add + // anything to the string, we trim all genotype fields enclosed in square + // brackets here. This may be changed in future, time will show... + size_t l_start = str->l; + int ks = convert->samples[js]; for (k=i; kreaders,ir)?'1':'0', str); } else if ( convert->fmt[k].handler ) + { + size_t l = str->l; convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str); + if ( l==str->l ) { str->l = l_start; break; } // only TBCSQ does this + } } } i = j-1; @@ -1029,6 +1348,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) } else if ( convert->fmt[i].handler ) convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str); + } return str->l - l_ori; } diff --git a/bcftools/csq.c b/bcftools/csq.c new file mode 100644 index 0000000..b1db103 --- /dev/null +++ b/bcftools/csq.c @@ -0,0 +1,3824 @@ +/* The MIT License + + Copyright (c) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ +/* + Things that would be nice to have + - for stop-lost events (also in frameshifts) report the number of truncated aa's + - memory could be greatly reduced by indexing gff (but it is quite compact already) + - deletions that go beyond transcript boundaries are not checked at sequence level + - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16 + - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882 + + Read about transcript types here + http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html + http://www.ensembl.org/info/genome/variation/predicted_data.html + http://www.gencodegenes.org/gencode_biotypes.html + + List of supported biotypes + antisense + IG_C_gene + IG_D_gene + IG_J_gene + IG_LV_gene + IG_V_gene + lincRNA + macro_lncRNA + miRNA + misc_RNA + Mt_rRNA + Mt_tRNA + polymorphic_pseudogene + processed_transcript + protein_coding + ribozyme + rRNA + sRNA + scRNA + scaRNA + sense_intronic + sense_overlapping + snRNA + snoRNA + TR_C_gene + TR_D_gene + TR_J_gene + TR_V_gene + + The gff parsing logic + We collect features such by combining gff lines A,B,C as follows: + A .. gene line with a supported biotype + A.ID=~/^gene:/ + + B .. transcript line referencing A + B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/ + + C .. corresponding CDS, exon, and UTR lines: + C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ + + For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the + complete chain link C -> B -> A is required. For the rest, link B -> A suffices. + + + The supported consequence types, sorted by impact: + splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron) + splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron) + stop_gained .. DNA sequence variant resulting in a stop codon + frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame + stop_lost .. elongated transcript, stop codon changed + start_lost .. the first codon changed + inframe_altering .. combination of indels leading to unchanged reading frame and length + inframe_insertion .. inserted coding sequence, unchanged reading frame + inframe_deletion .. deleted coding sequence, unchanged reading frame + missense_variant .. amino acid (aa) change, unchanged length + splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron + synonymous_variant .. DNA sequence variant resulting in no amino acid change + stop_retained_variant .. different stop codon + non_coding_variant .. variant in non-coding sequence, such as RNA gene + 5_prime_UTR_variant + 3_prime_UTR_variant + intron_variant .. reported only if none of the above + intergenic_variant .. reported only if none of the above + + + The annotation algorithm. + The algorithm checks if the variant falls in a region of a supported type. The + search is performed in the following order, until a match is found: + 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences + 2. idx_utr(gf_utr_t) - check UTR hits + 3. idx_exon(gf_exon_t) - check for splice variants + 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc. + + These regidx indexes are created by parsing a gff3 file as follows: + 1. create the array "ftr" of all UTR, CDS, exons. This will be + processed later and pruned based on transcript types we want to keep. + In the same go, create the hash "id2tr" of transcripts to keep + (based on biotype) which maps from transcript_id to a transcript. At + the same time also build the hash "gid2gene" which maps from gene_id to + gf_gene_t pointer. + + 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes. + Use only features from "ftr" which are present in "id2tr". + + 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene. + + Data structures. + idx_cds, idx_utr, idx_exon, idx_tscript: + as described above, regidx structures for fast lookup of exons/transcripts + overlapping a region, the payload is a pointer to tscript.cds +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "filter.h" +#include "regidx.h" +#include "kheap.h" +#include "smpl_ilist.h" +#include "rbuf.h" + +#ifndef __FUNCTION__ +# define __FUNCTION__ __func__ +#endif + +// Logic of the filters: include or exclude sites which match the filters? +#define FLT_INCLUDE 1 +#define FLT_EXCLUDE 2 + +// Definition of splice_region, splice_acceptor and splice_donor +#define N_SPLICE_DONOR 2 +#define N_SPLICE_REGION_EXON 3 +#define N_SPLICE_REGION_INTRON 8 + +// Ensembl ID format, e.g. +// ENST00000423372 for human .. ENST%011d +// ENSMUST00000120394 for mouse .. ENSMUST%011d +char ENSID_BUF[32], *ENSID_FMT = NULL; +static inline char *ENSID(uint32_t id) +{ + sprintf(ENSID_BUF,ENSID_FMT,id); + return ENSID_BUF; +} + + +#define N_REF_PAD 10 // number of bases to avoid boundary effects + +#define STRAND_REV 0 +#define STRAND_FWD 1 + +#define TRIM_NONE 0 +#define TRIM_5PRIME 1 +#define TRIM_3PRIME 2 + +// How to treat phased/unphased genotypes +#define PHASE_REQUIRE 0 // --phase r +#define PHASE_MERGE 1 // --phase m +#define PHASE_AS_IS 2 // --phase a +#define PHASE_SKIP 3 // --phase s +#define PHASE_NON_REF 4 // --phase R +#define PHASE_DROP_GT 5 // --samples - + +// Node types in the haplotype tree +#define HAP_CDS 0 +#define HAP_ROOT 1 +#define HAP_SSS 2 // start/stop/splice + +#define CSQ_PRINTED_UPSTREAM (1<<0) +#define CSQ_SYNONYMOUS_VARIANT (1<<1) +#define CSQ_MISSENSE_VARIANT (1<<2) +#define CSQ_STOP_LOST (1<<3) +#define CSQ_STOP_GAINED (1<<4) +#define CSQ_INFRAME_DELETION (1<<5) +#define CSQ_INFRAME_INSERTION (1<<6) +#define CSQ_FRAMESHIFT_VARIANT (1<<7) +#define CSQ_SPLICE_ACCEPTOR (1<<8) +#define CSQ_SPLICE_DONOR (1<<9) +#define CSQ_START_LOST (1<<10) +#define CSQ_SPLICE_REGION (1<<11) +#define CSQ_STOP_RETAINED (1<<12) +#define CSQ_UTR5 (1<<13) +#define CSQ_UTR3 (1<<14) +#define CSQ_NON_CODING (1<<15) +#define CSQ_INTRON (1<<16) +//#define CSQ_INTERGENIC (1<<17) +#define CSQ_INFRAME_ALTERING (1<<18) +#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string +#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf +#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence + +// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345 +#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \ + CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \ + CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \ + CSQ_UPSTREAM_STOP) +#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST) + +#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) +#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) +#define CSQ_PRN_BIOTYPE CSQ_NON_CODING + +// see kput_vcsq() +const char *csq_strings[] = +{ + NULL, + "synonymous", + "missense", + "stop_lost", + "stop_gained", + "inframe_deletion", + "inframe_insertion", + "frameshift", + "splice_acceptor", + "splice_donor", + "start_lost", + "splice_region", + "stop_retained", + "5_prime_utr", + "3_prime_utr", + "non_coding", + "intron", + "intergenic", + "inframe_altering", + NULL, + NULL, + "coding_sequence" +}; + + +// GFF line types +#define GFF_TSCRIPT_LINE 1 +#define GFF_GENE_LINE 2 + + +/* + Genomic features, for fast lookup by position to overlapping features +*/ +#define GF_coding_bit 6 +#define GF_is_coding(x) ((x) & (1<5I|121ACG>A+124TA>T" + + vcrec_t + single VCF record and csq tied to this record. (Haplotype can have multiple + consequences in several VCF records. Each record can have multiple consequences + from multiple haplotypes.) + + csq_t + a top-level consequence tied to a haplotype + + vbuf_t + pos2vbuf + VCF records with the same position clustered together for a fast lookup via pos2vbuf +*/ +typedef struct _vbuf_t vbuf_t; +typedef struct _vcsq_t vcsq_t; +struct _vcsq_t +{ + uint32_t strand:1, + type:31; // one of CSQ_* types + uint32_t trid; + uint32_t biotype; // one of GF_* types + char *gene; // gene name + bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234" + kstring_t vstr; // variant string, eg 5TY>5I|121ACG>A+124TA>T +}; +typedef struct +{ + bcf1_t *line; + uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved + uint32_t nfmt:4, nvcsq:28, mvcsq; + vcsq_t *vcsq; // there can be multiple consequences for a single VCF record +} +vrec_t; +typedef struct +{ + uint32_t pos; + vrec_t *vrec; // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf) + int idx; // 0-based index of the csq at the VCF line, for FMT/BCSQ + vcsq_t type; +} +csq_t; +struct _vbuf_t +{ + vrec_t **vrec; // buffer of VCF lines with the same position + int n, m; +}; +KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*) + + +/* + Structures related to haplotype-aware consequences in coding regions + + hap_node_t + node of a haplotype tree. Each transcript has one tree + + tscript_t + despite its general name, it is intended for coding transcripts only + + hap_t + hstack_t + for traversal of the haplotype tree and braking combined + consequences into independent parts +*/ +typedef struct _hap_node_t hap_node_t; +struct _hap_node_t +{ + char *seq; // cds segment [parent_node,this_node) + char *var; // variant "ref>alt" + uint32_t type:2, // HAP_ROOT or HAP_CDS + csq:30; // this node's consequence + int dlen; // alt minus ref length: <0 del, >0 ins, 0 substitution + uint32_t rbeg; // variant's VCF position (0-based, inclusive) + int32_t rlen; // variant's rlen; alen=rlen+dlen; fake for non CDS types + uint32_t sbeg; // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included) + uint32_t icds; // which exon does this node's variant overlaps + hap_node_t **child, *prev; // children haplotypes and previous coding node + int nchild, mchild; + bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record + uint32_t nend; // number of haplotypes ending in this node + int *cur_child, mcur_child; // mapping from the allele to the currently active child + csq_t *csq_list; // list of haplotype's consequences, broken by position + int ncsq_list, mcsq_list; +}; +struct _tscript_t +{ + uint32_t id; // transcript id + uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive) + uint32_t strand:1, // STRAND_REV or STRAND_FWD + ncds:31, // number of exons + mcds; + gf_cds_t **cds; // ordered list of exons + char *ref; // reference sequence, padded with N_REF_PAD bases on both ends + char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends + hap_node_t *root; // root of the haplotype tree + hap_node_t **hap; // pointer to haplotype leaves, two for each sample + int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD + uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types + type:30; // one of GF_* types + gf_gene_t *gene; +}; +static inline int cmp_tscript(tscript_t **a, tscript_t **b) +{ + return ( (*a)->end < (*b)->end ) ? 1 : 0; +} +KHEAP_INIT(trhp, tscript_t*, cmp_tscript) +typedef khp_trhp_t tr_heap_t; +typedef struct +{ + hap_node_t *node; // current node + int ichild; // current child in the active node + int dlen; // total dlen, from the root to the active node + size_t slen; // total sequence length, from the root to the active node +} +hstack_t; +typedef struct +{ + int mstack; + hstack_t *stack; + tscript_t *tr; // tr->ref: spliced transcript on ref strand + kstring_t sseq; // spliced haplotype sequence on ref strand + kstring_t tseq; // the variable part of translated haplotype transcript, coding strand + kstring_t tref; // the variable part of translated reference transcript, coding strand + uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS + int upstream_stop; +} +hap_t; + + +/* + Helper structures, only for initialization + + ftr_t + temporary list of all exons, CDS, UTRs +*/ +KHASH_MAP_INIT_INT(int2tscript, tscript_t*) +KHASH_MAP_INIT_INT(int2int, int) +KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) +typedef struct +{ + int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR + uint32_t beg; + uint32_t end; + uint32_t trid; + uint32_t strand:1; // STRAND_REV,STRAND_FWD + uint32_t phase:2; // 0, 1 or 2 + uint32_t iseq:29; +} +ftr_t; +typedef struct +{ + // all exons, CDS, UTRs + ftr_t *ftr; + int nftr, mftr; + + // mapping from transcript ensembl id to gene id + kh_int2gene_t *gid2gene; + + // mapping from transcript id to tscript, for quick CDS anchoring + kh_int2tscript_t *id2tr; + + // sequences + void *seq2int; + char **seq; + int nseq, mseq; + + // ignored biotypes + void *ignored_biotypes; +} +aux_t; + +typedef struct _args_t +{ + // the main regidx lookups, from chr:beg-end to overlapping features and + // index iterator + regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; + regitr_t *itr; + + // temporary structures, deleted after initializtion + aux_t init; + + // text tab-delimited output (out) or vcf/bcf output (out_fh) + FILE *out; + htsFile *out_fh; + + // vcf + bcf_srs_t *sr; + bcf_hdr_t *hdr; + int hdr_nsmpl; // actual number of samples in the vcf, for bcf_update_format_values() + + // include or exclude sites which match the filters + filter_t *filter; + char *filter_str; + int filter_logic; // FLT_INCLUDE or FLT_EXCLUDE + + // samples to process + int sample_is_file; + char *sample_list; + smpl_ilist_t *smpl; + + char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; + char *bcsq_tag; + int argc, output_type; + int phase, quiet, local_csq; + int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ + int ncsq_small_warned; + + int rid; // current chromosome + tr_heap_t *active_tr; // heap of active transcripts for quick flushing + hap_t *hap; // transcript haplotype recursion + vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush + rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf + kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position + tscript_t **rm_tr; // buffer of transcripts to clean + int nrm_tr, mrm_tr; + csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs + int ncsq_buf, mcsq_buf; + + faidx_t *fai; + kstring_t str, str2; + int32_t *gt_arr, mgt_arr; +} +args_t; + +// AAA, AAC, ... +const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; +const uint8_t nt4[] = +{ + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4, + 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 3 +}; +const uint8_t cnt4[] = +{ + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4, + 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 0 +}; +#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] +#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] + +static const char *gf_strings_noncoding[] = +{ + "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", + "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", + "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", + "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", + "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", + "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", + "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", + "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf" +}; +static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; +static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; + +const char *gf_type2gff_string(int type) +{ + if ( !GF_is_coding(type) ) + { + if ( type < (1<init; + char c = chr_end[1]; + chr_end[1] = 0; + int iseq; + if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) + { + hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); + aux->seq[aux->nseq] = strdup(chr_beg); + iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); + aux->nseq++; + assert( aux->nseq < 256 ); // see gf_gene_t.iseq + } + chr_end[1] = c; + return iseq; +} +static inline char *gff_skip(const char *line, char *ss) +{ + while ( *ss && *ss!='\t' ) ss++; + if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return ss+1; +} +static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end) +{ + char *se = (char*) line; + while ( *se && *se!='\t' ) se++; + if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + *chr_beg = (char*) line; + *chr_end = se-1; +} +static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) +{ + char *se = ss; + *beg = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); + ss = se+1; + *end = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return se+1; +} +static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss) +{ + ss = strstr(ss,needle); + if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); + ss += strlen(needle); + while ( *ss && !isdigit(*ss) ) ss++; + if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); + char *se; + uint32_t id = strtol(ss, &se, 10); + if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); + if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice + return id; +} +static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss) +{ + ss = strstr(ss,needle); + if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); + ss += strlen(needle); + char *se = ss; + while ( *se && !isdigit(*se) ) se++; + kstring_t str = {0,0,0}; + kputsn(ss,se-ss,&str); + ss = se; + while ( *se && isdigit(*se) ) se++; + ksprintf(&str,"%%0%dd",(int)(se-ss)); + ENSID_FMT = str.s; +} +static inline int gff_parse_type(char *line) +{ + line = strstr(line,"ID="); + if ( !line ) return -1; + line += 3; + if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE; + else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE; + return -1; +} +static inline int gff_parse_biotype(char *_line) +{ + char *line = strstr(_line,"biotype="); + if ( !line ) return -1; + + line += 8; + switch (*line) + { + case 'p': + if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; + else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; + else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; + else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; + break; + case 'a': + if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT; + else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; + else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; + break; + case 'I': + if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C; + else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D; + else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J; + else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV; + else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V; + else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; + else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; + else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; + else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; + break; + case 'T': + if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C; + else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D; + else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J; + else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V; + else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; + else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; + break; + case 'M': + if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; + else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; + else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; + break; + case 'l': + if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; + break; + case 'm': + if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; + else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA; + else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; + break; + case 'r': + if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; + else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; + else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON; + else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; + break; + case 's': + if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; + else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; + else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; + else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; + else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; + else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; + else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; + break; + case 't': + if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; + break; + case 'n': + if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; + else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; + break; + case 'k': + if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; + break; + case 'u': + if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; + break; + case 'L': + if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; + break; + case '3': + if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; + break; + case 'd': + if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; + break; + case 'v': + if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; + break; + case 'b': + if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; + break; + } + return 0; +} +static inline int gff_ignored_biotype(args_t *args, char *ss) +{ + ss = strstr(ss,"biotype="); + if ( !ss ) return 0; + + ss += 8; + char *se = ss, tmp; + while ( *se && *se!=';' ) se++; + tmp = *se; + *se = 0; + + char *key = ss; + int n = 0; + if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); + khash_str2int_set(args->init.ignored_biotypes, key, n+1); + + *se = tmp; + return 1; +} +gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) +{ + khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); + gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); + if ( !gene ) + { + gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); + int ret; + k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); + kh_val(aux->gid2gene,k) = gene; + } + return gene; +} +void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) +{ + aux_t *aux = &args->init; + int biotype = gff_parse_biotype(ss); + if ( biotype <= 0 ) + { + if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored transcript: %s\n",line); + return; + } + + // create a mapping from transcript_id to gene_id + uint32_t trid = gff_parse_id(line, "ID=transcript:", ss); + uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss); + + if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species + + tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); + tr->id = trid; + tr->strand = ftr->strand; + tr->gene = gene_init(aux, gene_id); + tr->type = biotype; + tr->beg = ftr->beg; + tr->end = ftr->end; + + khint_t k; + int ret; + k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); + kh_val(aux->id2tr,k) = tr; +} +void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr) +{ + int biotype = gff_parse_biotype(ss); + if ( biotype <= 0 ) + { + if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored gene: %s\n",line); + return; + } + + aux_t *aux = &args->init; + + // substring search for "ID=gene:ENSG00000437963" + uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss); + gf_gene_t *gene = gene_init(aux, gene_id); + assert( !gene->name ); // the gene_id should be unique + + gene->iseq = feature_set_seq(args, chr_beg,chr_end); + + // substring search for "Name=OR4F5" + ss = strstr(chr_end+2,"Name="); + if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line); + ss += 5; + char *se = ss; + while ( *se && *se!=';' && !isspace(*se) ) se++; + gene->name = (char*) malloc(se-ss+1); + memcpy(gene->name,ss,se-ss); + gene->name[se-ss] = 0; +} +int gff_parse(args_t *args, char *line, ftr_t *ftr) +{ + // - skip empty lines and commented lines + // - columns + // 1. chr + // 2. + // 3. CDS, transcript, gene, ... + // 4-5. beg,end + // 6. + // 7. strand + // 8. phase + // 9. Parent=transcript:ENST(\d+);ID=... etc + + char *ss = line; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *chr_beg, *chr_end; + gff_parse_chr(line, &chr_beg, &chr_end); + ss = gff_skip(line, chr_end + 2); + + // 3. column: is this a CDS, transcript, gene, etc. + if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } + else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } + else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } + else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } + else + { + ss = gff_skip(line, ss); + ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); + ss = gff_skip(line, ss); + int type = gff_parse_type(ss); + if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) + { + // we ignore these, debug print to see new types: + ss = strstr(ss,"ID="); + if ( !ss ) return -1; // no ID, ignore the line + if ( !strncmp("chromosome",ss+3,10) ) return -1; + if ( !strncmp("supercontig",ss+3,11) ) return -1; + if ( args->quiet<2 ) fprintf(stderr,"ignored: %s\n", line); + return -1; + } + + // 7. column: strand + if ( *ss == '+' ) ftr->strand = STRAND_FWD; + else if ( *ss == '-' ) ftr->strand = STRAND_REV; + else error("Unknown strand: %c .. %s\n", *ss,ss); + + if ( type==GFF_TSCRIPT_LINE ) + gff_parse_transcript(args, line, ss, ftr); + else + gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr); + + return -1; + } + ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); + ss = gff_skip(line, ss); + + // 7. column: strand + if ( *ss == '+' ) ftr->strand = STRAND_FWD; + else if ( *ss == '-' ) ftr->strand = STRAND_REV; + else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; } + ss += 2; + + // 8. column: phase (codon offset) + if ( *ss == '0' ) ftr->phase = 0; + else if ( *ss == '1' ) ftr->phase = 1; + else if ( *ss == '2' ) ftr->phase = 2; + else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase + else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } + ss += 2; + + // substring search for "Parent=transcript:ENST00000437963" + ftr->trid = gff_parse_id(line, "Parent=transcript:", ss); + ftr->iseq = feature_set_seq(args, chr_beg,chr_end); + return 0; +} + +static int cmp_cds_ptr(const void *a, const void *b) +{ + // comparison function for qsort of transcripts's CDS + if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; + if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1; + return 0; +} + +static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) +{ + *chr_beg = *chr_end = aux->seq[iseq]; + while ( (*chr_end)[1] ) (*chr_end)++; +} +tscript_t *tscript_init(aux_t *aux, uint32_t trid) +{ + khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); + tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k); + assert( tr ); + return tr; +} +void register_cds(args_t *args, ftr_t *ftr) +{ + // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. + // ftr is the result of parsing a gff CDS line + aux_t *aux = &args->init; + + tscript_t *tr = tscript_init(aux, ftr->trid); + if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand); + + gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); + cds->tr = tr; + cds->beg = ftr->beg; + cds->len = ftr->end - ftr->beg + 1; + cds->icds = 0; // to keep valgrind on mac happy + cds->phase = ftr->phase; + + hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); + tr->cds[tr->ncds++] = cds; +} +void register_utr(args_t *args, ftr_t *ftr) +{ + aux_t *aux = &args->init; + gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); + utr->which = ftr->type==GF_UTR3 ? prime3 : prime5; + utr->beg = ftr->beg; + utr->end = ftr->end; + utr->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); +} +void register_exon(args_t *args, ftr_t *ftr) +{ + aux_t *aux = &args->init; + gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); + exon->beg = ftr->beg; + exon->end = ftr->end; + exon->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); +} + +void tscript_init_cds(args_t *args) +{ + aux_t *aux = &args->init; + + // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) + khint_t k; + for (k=0; kid2tr); k++) + { + if ( !kh_exist(aux->id2tr, k) ) continue; + tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k); + + // position-to-tscript lookup + char *chr_beg, *chr_end; + chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); + + if ( !tr->ncds ) continue; // transcript with no CDS + + // sort CDs + qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); + + // trim non-coding start + int i, len = 0; + if ( tr->strand==STRAND_FWD ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + + // sanity check phase + for (i=0; incds; i++) + { + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3) + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + assert( phase == len%3 ); + len += tr->cds[i]->len; + } + } + else + { + // Check that the phase is not bigger than CDS length. Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + tr->cds[i]->len -= tr->cds[i]->phase; + tr->cds[i]->phase = 0; + + // sanity check phase + for (i=tr->ncds-1; i>=0; i--) + { + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3) + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + len += tr->cds[i]->len; + } + } + + // set len. At the same check that CDS within a transcript do not overlap + len = 0; + for (i=0; incds; i++) + { + tr->cds[i]->icds = i; + len += tr->cds[i]->len; + if ( !i ) continue; + + gf_cds_t *a = tr->cds[i-1]; + gf_cds_t *b = tr->cds[i]; + if ( a->beg + a->len - 1 >= b->beg ) + error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n", + kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + } + if ( len%3 != 0 ) + { + // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 + // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 + // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. + + tr->trim |= TRIM_3PRIME; + if ( tr->strand==STRAND_FWD ) + { + i = tr->ncds - 1; + while ( i>=0 && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + len -= dlen; + i--; + } + } + else + { + i = 0; + while ( incds && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + tr->cds[i]->beg += dlen; + len -= dlen; + i++; + } + } + } + + // set CDS offsets and insert into regidx + len=0; + for (i=0; incds; i++) + { + tr->cds[i]->pos = len; + len += tr->cds[i]->len; + regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); + } + } +} + +void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } +void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); } + +void init_gff(args_t *args) +{ + aux_t *aux = &args->init; + aux->seq2int = khash_str2int_init(); // chrom's numeric id + aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene + aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t + args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL); + aux->ignored_biotypes = khash_str2int_init(); + + // parse gff + kstring_t str = {0,0,0}; + htsFile *fp = hts_open(args->gff_fname,"r"); + if ( !fp ) error("Failed to read %s\n", args->gff_fname); + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); + int ret = gff_parse(args, str.s, aux->ftr + aux->nftr); + if ( !ret ) aux->nftr++; + } + free(str.s); + if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname); + + + // process gff information: connect CDS and exons to transcripts + args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); + args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); + args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); + args->itr = regitr_init(NULL); + + int i; + for (i=0; inftr; i++) + { + ftr_t *ftr = &aux->ftr[i]; + + // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? + khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); + if ( k==kh_end(aux->id2tr) ) continue; // no such transcript + + tscript_t *tr = kh_val(aux->id2tr,k); + if ( !tr->gene->name ) + { + // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript) + regidx_free_tscript(&tr); + kh_del(int2tscript, aux->id2tr,k); + continue; + } + + // populate regidx by category: + // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 + // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... + if ( ftr->type==GF_CDS ) register_cds(args, ftr); + else if ( ftr->type==GF_EXON ) register_exon(args, ftr); + else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr); + else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr); + else + error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type)); + } + tscript_init_cds(args); + + if ( !args->quiet ) + { + fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + regidx_nregs(args->idx_tscript), + regidx_nregs(args->idx_exon), + regidx_nregs(args->idx_cds), + regidx_nregs(args->idx_utr)); + } + + free(aux->ftr); + khash_str2int_destroy_free(aux->seq2int); + // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); + kh_destroy(int2tscript,aux->id2tr); + free(aux->seq); + + if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) + { + khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; + fprintf(stderr,"Ignored the following biotypes:\n"); + for (i = kh_begin(ign); i < kh_end(ign); i++) + { + if ( !kh_exist(ign,i)) continue; + fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i)); + } + } + khash_str2int_destroy_free(aux->ignored_biotypes); +} + +void init_data(args_t *args) +{ + args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; + + if ( !args->quiet ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); + init_gff(args); + + args->rid = -1; + + if ( args->filter_str ) + args->filter = filter_init(args->hdr, args->filter_str); + + args->fai = fai_load(args->fa_fname); + if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); + + args->pos2vbuf = kh_init(pos2vbuf); + args->active_tr = khp_init(trhp); + args->hap = (hap_t*) calloc(1,sizeof(hap_t)); + + // init samples + if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT; + if ( args->sample_list && !strcmp("-",args->sample_list) ) + { + // ignore all samples + if ( args->output_type==FT_TAB_TEXT ) + { + // significant speedup for plain VCFs + bcf_hdr_set_samples(args->hdr,NULL,0); + } + args->phase = PHASE_DROP_GT; + } + else + args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT); + args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr); + + if ( args->output_type==FT_TAB_TEXT ) + { + args->out = args->output_fname ? fopen(args->output_fname,"w") : stdout; + if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno)); + + fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version()); + fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]); + int i; + for (i=1; iargc; i++) + fprintf(args->out," %s",args->argv[i]); + fprintf(args->out,"\n"); + fprintf(args->out,"# LOG\t[2]Message\n"); + fprintf(args->out,"# CSQ"); i = 1; + fprintf(args->out,"\t[%d]Sample", ++i); + fprintf(args->out,"\t[%d]Haplotype", ++i); + fprintf(args->out,"\t[%d]Chromosome", ++i); + fprintf(args->out,"\t[%d]Position", ++i); + fprintf(args->out,"\t[%d]Consequence", ++i); + fprintf(args->out,"\n"); + } + else + { + args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); + bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); + bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); + if ( args->hdr_nsmpl ) + bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); + bcf_hdr_write(args->out_fh, args->hdr); + } + if ( !args->quiet ) fprintf(stderr,"Calling...\n"); +} + +void destroy_data(args_t *args) +{ + regidx_destroy(args->idx_cds); + regidx_destroy(args->idx_utr); + regidx_destroy(args->idx_exon); + regidx_destroy(args->idx_tscript); + regitr_destroy(args->itr); + + khint_t k,i,j; + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(args->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k); + free(gene->name); + free(gene); + } + kh_destroy(int2gene,args->init.gid2gene); + + if ( args->filter ) + filter_destroy(args->filter); + + khp_destroy(trhp,args->active_tr); + kh_destroy(pos2vbuf,args->pos2vbuf); + if ( args->smpl ) smpl_ilist_destroy(args->smpl); + int ret; + if ( args->out_fh ) + ret = hts_close(args->out_fh); + else + ret = fclose(args->out); + if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + for (i=0; ivcf_rbuf.m; i++) + { + vbuf_t *vbuf = args->vcf_buf[i]; + if ( !vbuf ) continue; + for (j=0; jm; j++) + { + if ( !vbuf->vrec[j] ) continue; + if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line); + free(vbuf->vrec[j]->smpl); + free(vbuf->vrec[j]->vcsq); + free(vbuf->vrec[j]); + } + free(vbuf->vrec); + free(vbuf); + } + free(args->vcf_buf); + free(args->rm_tr); + free(args->csq_buf); + free(args->hap->stack); + free(args->hap->sseq.s); + free(args->hap->tseq.s); + free(args->hap->tref.s); + free(args->hap); + fai_destroy(args->fai); + free(args->gt_arr); + free(args->str.s); + free(args->str2.s); + free(ENSID_FMT); +} + +/* + The splice_* functions are for consquences around splice sites: start,stop,splice_* + */ +#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely +#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region +#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed +#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq +typedef struct +{ + tscript_t *tr; + struct { + int32_t pos, rlen, alen; + char *ref, *alt; + bcf1_t *rec; + } vcf; + uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev) + check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon + check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon + check_donor:1, // as with check_acceptor + check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon + check_region_end:1, // + check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr + set_refalt:1; // set kref,kalt, if set, check also for synonymous events + uint32_t csq; + int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele + uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives + ref_end; // a more conservative csq (the first and last base in kref.s) + kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP +} +splice_t; +void splice_init(splice_t *splice, bcf1_t *rec) +{ + memset(splice,0,sizeof(*splice)); + splice->vcf.rec = rec; + splice->vcf.pos = rec->pos; + splice->vcf.rlen = rec->rlen; + splice->vcf.ref = rec->d.allele[0]; +} +static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) +{ + // len>0 .. beg is the first base, del filled from right + // len<0 .. beg is the last base, del filled from left + + int rlen, alen, rbeg, abeg; // first base to include (ref coordinates) + if ( len<0 ) + { + rlen = alen = -len; + rbeg = beg - rlen + 1; + int dlen = splice->vcf.alen - splice->vcf.rlen; + if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle + dlen += splice->ref_end - beg; + abeg = rbeg + dlen; + } + else + { + rbeg = abeg = beg; + rlen = alen = len; + // check for incomplete del as above?? + } + +#define XDBG 0 +#if XDBG +fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); +#endif + splice->kref.l = 0; + splice->kalt.l = 0; + + // add the part before vcf.ref, in the vcf.ref and after vcf.ref + int roff; // how many vcf.ref bases already used + if ( rbeg < splice->vcf.pos ) + { + assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD + kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); + roff = 0; + } + else + roff = rbeg - splice->vcf.pos; +#if XDBG +fprintf(stderr,"r1: %s roff=%d\n",splice->kref.s,roff); +#endif + + if ( roff < splice->vcf.rlen && splice->kref.l < rlen ) + { + int len = splice->vcf.rlen - roff; // len still available in vcf.ref + if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed + kputsn(splice->vcf.ref + roff, len, &splice->kref); + } +#if XDBG +fprintf(stderr,"r2: %s\n",splice->kref.s); +#endif + + uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele + if ( splice->kref.l < rlen ) + { + if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD) + rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end; + if ( splice->kref.l < rlen ) + kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); + } +#if XDBG +fprintf(stderr,"r3: %s\n",splice->kref.s); +#endif + + + int aoff; + if ( abeg < splice->vcf.pos ) + { + assert( splice->tr->beg <= abeg ); + kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); + aoff = 0; + } + else + aoff = abeg - splice->vcf.pos; +#if XDBG +fprintf(stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff); +#endif + + if ( aoff < splice->vcf.alen && splice->kalt.l < alen ) + { + int len = splice->vcf.alen - aoff; // len still available in vcf.alt + if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed + kputsn(splice->vcf.alt + aoff, len, &splice->kalt); + aoff -= len; + } + if ( aoff < 0 ) aoff = 0; + else aoff--; +#if XDBG +fprintf(stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); +#endif + + end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele + if ( splice->kalt.l < alen ) + { + if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long + alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end; + if ( alen > 0 && alen > splice->kalt.l ) + kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); + } +#if XDBG +fprintf(stderr,"a3: %s\n",splice->kalt.s); +fprintf(stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s); +#endif +} +void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); +static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid) +{ + while ( regitr_overlap(itr) ) + { + gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); + tscript_t *tr = utr->tr; + if ( tr->id != trid ) continue; + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); + return csq.type.type; + } + return 0; +} +static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type) +{ +#if XDBG +fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); +#endif + if ( !type ) return; + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = type; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); +} +static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp + // before and after the inserted bases + if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] ) + { + splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend; + } + else + { + if ( splice->tend ) splice->tend--; + splice->ref_beg = splice->vcf.pos; + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend; + } +#if XDBG +fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +#endif + + int ret; + if ( splice->ref_beg >= ex_end ) // fully outside, beyond the exon + { + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr + { + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + if ( ret!=0 ) + { + regitr_destroy(itr); + return SPLICE_OUTSIDE; // overlaps utr + } + } + regitr_destroy(itr); + } + if ( !splice->check_region_end ) return SPLICE_OUTSIDE; + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_beg < ex_end + N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon + { + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr + { + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + if ( ret!=0 ) + { + regitr_destroy(itr); + return SPLICE_OUTSIDE; // overlaps utr + } + } + regitr_destroy(itr); + } + if ( !splice->check_region_beg ) return SPLICE_OUTSIDE; + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON); + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_end > ex_beg - N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + // overlaps the exon or inside the exon + // possible todo: find better alignment for frameshifting variants? + if ( splice->ref_beg <= ex_beg + 2 ) // in the first 3bp + { + if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->ref_end > ex_end - 2 ) + { + if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->set_refalt ) + { + // Make sure the variant will not end up left aligned to avoid overlapping vcf records + // splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1); + // splice->vcf.rlen -= splice->tbeg + splice->tend - 1; + // if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; } + if ( splice->ref_beg < splice->vcf.pos ) // this must have been caused by too much trimming from right + { + int dlen = splice->vcf.pos - splice->ref_beg; + assert( dlen==1 ); + splice->tbeg += dlen; + if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen; + splice->ref_beg = splice->vcf.pos; + } + if ( splice->ref_end==ex_beg ) splice->tend--; // prevent zero-length ref allele + splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1); + splice->vcf.rlen -= splice->tbeg + splice->tend - 1; + if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; } + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_INSIDE; +} + +static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG + splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base + +#if XDBG +fprintf(stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +#endif + + if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1 + { + if ( splice->check_region_beg ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + // filling from the left does not work for ENST00000341065/frame3.vcf + // CAG.GTGGCCAG CAG.GTGGCCAG + // CA-.--GGCCAG vs CAG.---GCCAG + // splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON); + // + // filling from the right: + splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON); + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + } + } + if ( splice->ref_end >= ex_beg ) + { + splice->tbeg = splice->ref_beg - splice->vcf.pos + 1; + splice->ref_beg = ex_beg - 1; + if ( splice->tbeg + splice->tend == splice->vcf.alen ) + { + // the deletion overlaps ex_beg and cannot be easily realigned to the right + if ( !splice->tend ) + { + splice->csq |= CSQ_CODING_SEQUENCE; + return SPLICE_OVERLAP; + } + splice->tend--; + } + } + } + if ( ex_end < splice->ref_end ) // the part after the exon + { + if ( splice->check_region_end ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); // ref,alt positioned at the first intron base + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_beg < ex_end + N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + } + } + if ( splice->ref_beg < ex_end ) + { + splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); + splice->ref_end = ex_end; + } + } + if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end ) + { + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + + if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 + { + if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->ref_end > ex_end - 3 ) + { + if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->set_refalt ) + { + if ( splice->tbeg>0 ) splice->tbeg--; //why is this? + if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend ) + { + splice->vcf.rlen -= splice->tbeg + splice->tend; + splice->vcf.alen -= splice->tbeg + splice->tend; + } + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); + if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf + { + splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION; + return SPLICE_OVERLAP; + } + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_INSIDE; +} + +static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + // not a real variant, can be ignored: eg ACGT>ACGT + if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF; + + splice->ref_beg = splice->vcf.pos + splice->tbeg; + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; + +#if XDBG +fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +#endif + + if ( splice->ref_beg < ex_beg ) // the part before the exon + { + if ( splice->check_region_beg ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR ) + splice->csq |= CSQ_SPLICE_REGION; + if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + } + } + } + if ( splice->ref_end >= ex_beg ) + { + splice->tbeg = splice->ref_beg - splice->vcf.pos; + splice->ref_beg = ex_beg; + } + } + if ( ex_end < splice->ref_end ) // the part after the exon + { + if ( splice->check_region_end ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR ) + splice->csq |= CSQ_SPLICE_REGION; + if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + } + } + } + if ( splice->ref_beg <= ex_end ) + { + splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); + splice->ref_end = ex_end; + } + } + if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end ) + { + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + + if ( splice->ref_beg < ex_beg + 3 ) + { + if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->ref_end > ex_end - 3 ) + { + if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->set_refalt ) + { + splice->vcf.rlen -= splice->tbeg + splice->tend; + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_INSIDE; +} +static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + splice->csq = 0; + splice->vcf.alen = strlen(splice->vcf.alt); + + int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0; + splice->tbeg = 0, splice->tend = 0; + + // trim from the right, then from the left + while ( i<=rlen1 && i<=alen1 ) + { + if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break; + i++; + } + splice->tend = i; + rlen1 -= i, alen1 -= i, i = 0; + while ( i<=rlen1 && i<=alen1 ) + { + if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break; + i++; + } + splice->tbeg = i; + + // The mnp, ins and del code was split into near-identical functions for clarity and debugging; + // possible todo: generalize once stable + if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end); + if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end); + if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end); + + return 0; +} + +// return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref) +int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial) +{ + int i; + kstring_t str = {0,0,0}; + tscript_t *tr = cds->tr; + child->icds = cds->icds; // index of cds in the tscript's list of exons + + splice_t splice; + splice_init(&splice, rec); + splice.tr = tr; + splice.vcf.alt = rec->d.allele[ial]; + splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1; + if ( !(tr->trim & TRIM_5PRIME) ) + { + if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; } + else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; } + } + if ( !(tr->trim & TRIM_3PRIME) ) + { + if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; } + else { if ( child->icds==0 ) splice.check_stop = 1; } + } + if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M + { + if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } + else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } + } + if ( child->icds!=0 ) splice.check_region_beg = 1; + if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; + +#if XDBG +fprintf(stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); +#endif + int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1); +#if XDBG +fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq); +#endif + + if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA + if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq + { + free(splice.kref.s); + free(splice.kalt.s); + + if ( !splice.csq ) return 2; // fully intronic, no csq + + // splice_region/acceptor/donor + child->seq = NULL; + child->sbeg = 0; + child->rbeg = rec->pos; + child->rlen = 0; + child->dlen = 0; + kputs(rec->d.allele[0],&str); + kputc('>',&str); + kputs(rec->d.allele[ial],&str); + child->var = str.s; + child->type = HAP_SSS; + child->csq = splice.csq; + child->prev = parent->type==HAP_SSS ? parent->prev : parent; + child->rec = rec; + return 0; + } + if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT; // synonymous&splice,frame could become synonymous&frame,splice + + int dbeg = 0; + if ( splice.ref_beg < cds->beg ) + { + // The vcf record overlaps the exon boundary, but the variant itself + // should fit inside since we are here. This will need more work. + // #1475227917 + dbeg = cds->beg - splice.ref_beg; + splice.kref.l -= dbeg; + splice.ref_beg = cds->beg; + assert( dbeg <= splice.kalt.l ); + } + + if ( parent->type==HAP_SSS ) parent = parent->prev; + if ( parent->type==HAP_CDS ) + { + i = parent->icds; + if ( i!=cds->icds ) + { + // the variant is on a new exon, finish up the previous + int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg; + if ( len > 0 ) + kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + } + + // append any skipped non-variant exons + while ( ++i < cds->icds ) + kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); + + if ( parent->icds==child->icds ) + { + int len = splice.ref_beg - parent->rbeg - parent->rlen; + if ( len < 0 ) // overlapping variants + { + free(str.s); + return 1; + } + kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + } + else + kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); + } + kputs(splice.kalt.s + dbeg, &str); + + child->seq = str.s; + child->sbeg = cds->pos + (splice.ref_beg - cds->beg); + child->rbeg = splice.ref_beg; + child->rlen = splice.kref.l; + child->type = HAP_CDS; + child->prev = parent; + child->rec = rec; + child->csq = splice.csq; + + // set vlen and the "ref>alt" string + { + int rlen = strlen(rec->d.allele[0]); + int alen = strlen(rec->d.allele[ial]); + child->dlen = alen - rlen; + child->var = (char*) malloc(rlen+alen+2); + memcpy(child->var,rec->d.allele[0],rlen); + child->var[rlen] = '>'; + memcpy(child->var+rlen+1,rec->d.allele[ial],alen); + child->var[rlen+alen+1] = 0; + } + + // yuck, the whole CDS is modified/deleted, not ready for this, todo. + if ( child->rbeg + child->rlen > cds->beg + cds->len ) + { + child->type = HAP_SSS; + if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf + } + + free(splice.kref.s); + free(splice.kalt.s); + return 0; +} +void hap_destroy(hap_node_t *hap) +{ + int i; + for (i=0; inchild; i++) + if ( hap->child[i] ) hap_destroy(hap->child[i]); + for (i=0; imcsq_list; i++) free(hap->csq_list[i].type.vstr.s); + free(hap->csq_list); + free(hap->child); + free(hap->cur_child); + free(hap->seq); + free(hap->var); + free(hap); +} + + +/* + ref: spliced reference and its length (ref.l) + seq: part of the spliced query transcript on the reference strand to translate, its + length (seq.l) and the total length of the complete transcript (seq.m) + sbeg: seq offset within the spliced query transcript + rbeg: seq offset within ref, 0-based + rend: last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l + strand: coding strand - 0:rev, 1:fwd + tseq: translated sequence (aa) + fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev) + */ +void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) +{ +#if XDBG +fprintf(stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); +#endif + char tmp[3], *codon, *end; + int i, len, npad; + + kstring_t ref = *_ref; + kstring_t seq = *_seq; + + tseq->l = 0; + if ( !seq.l ) + { + kputc('?', tseq); + return; + } + +#define DBG 0 +#if DBG + fprintf(stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); + fprintf(stderr," ref: l=%d %s\n", (int)ref.l,ref.s); + fprintf(stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m); + for (i=0; i1 + fprintf(stderr," npad: %d\n",npad); +#endif + assert( npad<=rbeg ); + + for (i=0; i1 + fprintf(stderr,"\t i=%d\n", i); +#endif + if ( i==3 ) + { + kputc_(dna2aa(tmp), tseq); +#if DBG>1 + fprintf(stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]); +#endif + codon = seq.s + 3 - npad; // next codon + end = codon + len - 1 - (len % 3); // last position of a valid codon + while ( codon < end ) + { + kputc_(dna2aa(codon), tseq); +#if DBG>1 + fprintf(stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]); +#endif + codon += 3; + } + end = seq.s + seq.l - 1; + for (i=0; codon+i<=end; i++) tmp[i] = codon[i]; + } + + // right padding + codon = ref.s + rend + N_REF_PAD; + if ( i>0 ) + { +#if DBG>1 + if(i==1)fprintf(stderr,"[3]%c\n",tmp[0]); + if(i==2)fprintf(stderr,"[3]%c%c\n",tmp[0],tmp[1]); +#endif + for (; i<3; i++) + { + tmp[i] = *codon; + codon++; + } + kputc_(dna2aa(tmp), tseq); +#if DBG>1 + fprintf(stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]); +#endif + } + if ( fill!=0 ) + { + end = ref.s + ref.l - N_REF_PAD; + while ( codon+3 <= end ) + { + kputc_(dna2aa(codon), tseq); +#if DBG>1 + fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon)); +#endif + codon += 3; + } + } + } + else // STRAND_REV + { + // right padding - number of bases to take from ref + npad = (seq.m - (sbeg + seq.l)) % 3; +#if DBG>1 + fprintf(stderr," npad: %d\n",npad); +#endif +if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); + assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand + + if ( npad==2 ) + { + tmp[1] = ref.s[rend+N_REF_PAD]; + tmp[2] = ref.s[rend+N_REF_PAD+1]; + i = 0; + } + else if ( npad==1 ) + { + tmp[2] = ref.s[rend+N_REF_PAD]; + i = 1; + } + else + i = 2; + + end = seq.s + seq.l; + for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end); +#if DBG>1 + fprintf(stderr,"\t i=%d\n", i); + if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]); + if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]); +#endif + if ( i==-1 ) + { +#if DBG>1 + fprintf(stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp)); +#endif + kputc_(cdna2aa(tmp), tseq); + codon = end - 3; + while ( codon >= seq.s ) + { + kputc_(cdna2aa(codon), tseq); +#if DBG>1 + fprintf(stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon)); +#endif + codon -= 3; + } + if ( seq.s-codon==2 ) + { + tmp[2] = seq.s[0]; + i = 1; + } + else if ( seq.s-codon==1 ) + { + tmp[1] = seq.s[0]; + tmp[2] = seq.s[1]; + i = 0; + } + else + i = -1; +#if DBG>1 + if(i==1)fprintf(stderr,"[3] %c\n",tmp[2]); + if(i==0)fprintf(stderr,"[3] %c%c\n",tmp[1],tmp[2]); +#endif + } + // left padding + end = ref.s + N_REF_PAD + rbeg; + if ( i>=0 ) + { + for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end); + kputc_(cdna2aa(tmp), tseq); +#if DBG>1 + fprintf(stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp)); +#endif + } + if ( fill!=0 ) + { + codon = end - 3; + while ( codon >= ref.s + N_REF_PAD ) + { + kputc_(cdna2aa(codon), tseq); +#if DBG>1 + fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon)); +#endif + codon -= 3; + } + } + } + kputc_(0,tseq); tseq->l--; +#if DBG + fprintf(stderr," tseq: %s\n", tseq->s); +#endif +} + +void tscript_splice_ref(tscript_t *tr) +{ + int i, len = 0; + for (i=0; incds; i++) + len += tr->cds[i]->len; + + tr->nsref = len + 2*N_REF_PAD; + tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); + len = 0; + + memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); + len += N_REF_PAD; + + for (i=0; incds; i++) + { + memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); + len += tr->cds[i]->len; + } + memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); + len += N_REF_PAD; + + tr->sref[len] = 0; +} + +// returns: 0 if consequence was added, 1 if it already exists or could not be added +int csq_push(args_t *args, csq_t *csq, bcf1_t *rec) +{ +#if XDBG +fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); +#endif + khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos); + vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k); + if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr); + + int i; + for (i=0; in; i++) + if ( vbuf->vrec[i]->line==rec ) break; + if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr); + vrec_t *vrec = vbuf->vrec[i]; + + // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor + if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) ) + csq->type.type &= ~CSQ_SPLICE_REGION; + + if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) + { + for (i=0; invcsq; i++) + { + // Same as below, to avoid records like + // 3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|- + // 3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C + if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP ) + { + vrec->vcsq[i] = csq->type; + goto exit_duplicate; + } + if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue; + if ( csq->type.ref != vrec->vcsq[i].ref ) continue; + goto exit_duplicate; + } + } + else if ( csq->type.type & CSQ_COMPOUND ) + { + for (i=0; invcsq; i++) + { + if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue; + if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; + if ( csq->type.gene != vrec->vcsq[i].gene ) continue; + if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) + { + // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function + // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered + // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two + // consequences: + // stop_lost|AL627309.1|ENST00000423372|protein_coding|- + // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA + if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s ) + { + if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP ) + { + vrec->vcsq[i].type |= csq->type.type; + + // remove stop_lost&synonymous if stop_retained set + if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) + vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT); + + if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr; + goto exit_duplicate; + } + continue; + } + if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue; + } + vrec->vcsq[i].type |= csq->type.type; + goto exit_duplicate; + } + } + else + { + for (i=0; invcsq; i++) + { + if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue; + if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; + if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) ) + { + vrec->vcsq[i].type |= csq->type.type; + goto exit_duplicate; + } + if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate; + } + } + // no such csq yet in this vcf record + csq->vrec = vrec; + csq->idx = i; + vrec->nvcsq++; + hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq); + vrec->vcsq[i] = csq->type; + return 0; + +exit_duplicate: + csq->vrec = vrec; + csq->idx = i; + return 1; +} + +// soff .. position of the variant within the trimmed query transcript +// sbeg .. position of the variant within the query transcript +// rbeg .. position on the reference transcript (if there are no indels, then rbeg=send) +// rpos .. VCF position +#define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen)) +#define node2sbeg(i) (hap->sbeg + node2soff(i)) +#define node2send(i) (hap->sbeg + hap->stack[i].slen) +#define node2rbeg(i) (hap->stack[i].node->sbeg) +#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen) +#define node2rpos(i) (hap->stack[i].node->rec->pos) + +void kput_vcsq(vcsq_t *csq, kstring_t *str) +{ + // Remove start/stop from incomplete CDS, but only if there is another + // consequence as something must be reported + if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS); + + // Remove missense from start/stops + if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT; + + if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref ) + { + kputc_('@',str); + kputw(csq->ref->pos+1, str); + return; + } + if ( csq->type & CSQ_UPSTREAM_STOP ) + kputc_('*',str); + + int i, n = sizeof(csq_strings)/sizeof(char*); + for (i=1; itype&(1<type&(1<gene ) kputs(csq->gene , str); + + kputc_('|', str); + if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid)); + + kputc_('|', str); + kputs(gf_type2gff_string(csq->biotype), str); + + if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l ) + kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str); + + if ( csq->vstr.l ) + kputs(csq->vstr.s, str); +} + +void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) +{ + int i; + tscript_t *tr = hap->tr; + int ref_node = tr->strand==STRAND_FWD ? ibeg : iend; + + int icsq = node->ncsq_list++; + hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); + csq_t *csq = &node->csq_list[icsq]; + csq->pos = hap->stack[ref_node].node->rec->pos; + csq->type.trid = tr->id; + csq->type.gene = tr->gene->name; + csq->type.strand = tr->strand; + csq->type.biotype = tr->type; + + // only now we see the translated sequence and can determine if the stop/start changes are real + int rm_csq = 0; + csq->type.type = 0; + for (i=ibeg; i<=iend; i++) + csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND; + if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING; + + int has_upstream_stop = hap->upstream_stop; + if ( hap->stack[ibeg].node->type != HAP_SSS ) + { + // check for truncating stops + for (i=0; itref.l; i++) + if ( hap->tref.s[i]=='*' ) break; + if ( i!=hap->tref.l ) + { + hap->tref.l = i+1; + hap->tref.s[i+1] = 0; + } + for (i=0; itseq.l; i++) + if ( hap->tseq.s[i]=='*' ) break; + if ( i!=hap->tseq.l ) + { + hap->tseq.l = i+1; + hap->tseq.s[i+1] = 0; + hap->upstream_stop = 1; + } + if ( csq->type.type & CSQ_STOP_LOST ) + { + if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) + { + rm_csq |= CSQ_STOP_LOST; + csq->type.type |= CSQ_STOP_RETAINED; + } + else if ( hap->tref.s[hap->tref.l-1]!='*' ) + { + // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense + // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf + if ( hap->tseq.s[hap->tseq.l-1] == '*' ) + { + rm_csq |= CSQ_STOP_GAINED; + csq->type.type |= CSQ_STOP_RETAINED; + } + else + csq->type.type |= CSQ_INCOMPLETE_CDS; + } + } + if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' ) + { + rm_csq |= CSQ_START_LOST; + csq->type.type &= ~CSQ_START_LOST; + } + if ( dlen!=0 ) + { + if ( dlen%3 ) + csq->type.type |= CSQ_FRAMESHIFT_VARIANT; + else if ( dlen<0 ) + csq->type.type |= CSQ_INFRAME_DELETION; + else + csq->type.type |= CSQ_INFRAME_INSERTION; + } + else + { + for (i=0; itref.l; i++) + if ( hap->tref.s[i] != hap->tseq.s[i] ) break; + if ( i==hap->tref.l ) + csq->type.type |= CSQ_SYNONYMOUS_VARIANT; + else if ( hap->tref.s[i] == '*' ) + csq->type.type |= CSQ_STOP_LOST; + else if ( hap->tseq.s[i] == '*' ) + csq->type.type |= CSQ_STOP_GAINED; + else + csq->type.type |= CSQ_MISSENSE_VARIANT; + } + } + if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP; + csq->type.type &= ~rm_csq; + + if ( hap->stack[ibeg].node->type == HAP_SSS ) + { + node->csq_list[icsq].type.type |= hap->stack[ibeg].node->csq & ~rm_csq; + node->csq_list[icsq].type.ref = hap->stack[ibeg].node->rec; + node->csq_list[icsq].type.biotype = tr->type; + csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec); + return; + } + + kstring_t str = node->csq_list[icsq].type.vstr; + str.l = 0; + + // create the aa variant string + int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; + int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; + kputc_('|', &str); + kputw(aa_rbeg, &str); + kputs(hap->tref.s, &str); + if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) + { + kputc_('>', &str); + kputw(aa_sbeg, &str); + kputs(hap->tseq.s, &str); + } + kputc_('|', &str); + + // create the dna variant string and, in case of combined variants, + // insert silent CSQ_PRINTED_UPSTREAM variants + for (i=ibeg; i<=iend; i++) + { + if ( i>ibeg ) kputc_('+', &str); + kputw(node2rpos(i)+1, &str); + kputs(hap->stack[i].node->var, &str); + } + node->csq_list[icsq].type.vstr = str; + csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec); + + for (i=ibeg; i<=iend; i++) + { + // csq are printed at one position only for combined variants, the rest is + // silent and references the first + if ( hap->stack[i].node->csq & ~CSQ_COMPOUND ) + { + node->ncsq_list++; + hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); + csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; + tmp_csq->pos = hap->stack[i].node->rec->pos; + tmp_csq->type.trid = tr->id; + tmp_csq->type.gene = tr->gene->name; + tmp_csq->type.strand = tr->strand; + tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq; + tmp_csq->type.biotype = tr->type; + tmp_csq->type.vstr.l = 0; + kputs(str.s,&tmp_csq->type.vstr); + csq_push(args, tmp_csq, hap->stack[i].node->rec); + } + if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) ) + { + node->ncsq_list++; + hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); + csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; + tmp_csq->pos = hap->stack[i].node->rec->pos; + tmp_csq->type.trid = tr->id; + tmp_csq->type.gene = tr->gene->name; + tmp_csq->type.strand = tr->strand; + tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq; + tmp_csq->type.biotype = tr->type; + tmp_csq->type.ref = hap->stack[ref_node].node->rec; + tmp_csq->type.vstr.l = 0; + csq_push(args, tmp_csq, hap->stack[i].node->rec); + } + } +} + +void hap_finalize(args_t *args, hap_t *hap) +{ + tscript_t *tr = hap->tr; + if ( !tr->sref ) + tscript_splice_ref(tr); + + kstring_t sref; + sref.s = tr->sref; + sref.l = tr->nsref; + sref.m = sref.l; + + int istack = 0; + hts_expand(hstack_t,1,hap->mstack,hap->stack); + + hap->sseq.l = 0; + hap->tseq.l = 0; + hap->stack[0].node = tr->root; + hap->stack[0].ichild = -1; + hap->stack[0].slen = 0; + hap->stack[0].dlen = 0; + + while ( istack>=0 ) + { + hstack_t *stack = &hap->stack[istack]; + hap_node_t *node = hap->stack[istack].node; + while ( ++hap->stack[istack].ichild < node->nchild ) + { + if ( node->child[stack->ichild] ) break; + } + if ( stack->ichild == node->nchild ) { istack--; continue; } + + node = node->child[stack->ichild]; + + istack++; + hts_expand(hstack_t,istack+1,hap->mstack,hap->stack); + stack = &hap->stack[istack-1]; + + hap->stack[istack].node = node; + hap->stack[istack].ichild = -1; + + hap->sseq.l = stack->slen; + if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq); + hap->stack[istack].slen = hap->sseq.l; + hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen; + + if ( !node->nend ) continue; // not a leaf node + + // The spliced sequence has been built for the current haplotype and stored + // in hap->sseq. Now we break it and output as independent parts + + kstring_t sseq; + sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript + hap->upstream_stop = 0; + + int i = 1, dlen = 0, ibeg, indel = 0; + while ( istack[i].node->type == HAP_SSS ) i++; + hap->sbeg = hap->stack[i].node->sbeg; + + if ( tr->strand==STRAND_FWD ) + { + i = 0, ibeg = -1; + while ( ++i <= istack ) + { + if ( hap->stack[i].node->type == HAP_SSS ) + { + // start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping + hap_add_csq(args,hap,node,0,i,i,0,0); + continue; + } + dlen += hap->stack[i].node->dlen; + if ( hap->stack[i].node->dlen ) indel = 1; + if ( isseq.l ) + { + sseq.l = hap->stack[i].slen - ioff; + sseq.s = hap->sseq.s + ioff; + } + else // splice site overlap, see #1475227917 + sseq.l = fill = 0; + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + + // ref + sseq.l = node2rend(i) - rbeg; + sseq.s = sref.s + N_REF_PAD + rbeg; + sseq.m = sref.m - 2*N_REF_PAD; + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; + + hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel); + ibeg = -1; + dlen = 0; + indel = 0; + } + } + else + { + i = istack + 1, ibeg = -1; + while ( --i > 0 ) + { + if ( hap->stack[i].node->type == HAP_SSS ) + { + hap_add_csq(args,hap,node,0,i,i,0,0); + continue; + } + dlen += hap->stack[i].node->dlen; + if ( hap->stack[i].node->dlen ) indel = 1; + if ( i>1 && hap->stack[i-1].node->type != HAP_SSS ) + { + if ( dlen%3 ) + { + if ( ibeg==-1 ) ibeg = i; + continue; + } + int icur = sseq.m - 1 - node2sbeg(i); + int inext = sseq.m - 1 - node2sbeg(i-1); + if ( icur/3 == inext/3 ) + { + if ( ibeg==-1 ) ibeg = i; + continue; + } + } + if ( ibeg<0 ) ibeg = i; + int ioff = node2soff(i); + int icur = node2sbeg(i); + int rbeg = node2rbeg(i); + int rend = node2rend(ibeg); + int fill = dlen%3; + + // alt + if ( hap->sseq.l ) + { + sseq.l = hap->stack[ibeg].slen - ioff; + sseq.s = hap->sseq.s + ioff; + } + else // splice site overlap, see #1475227917 + sseq.l = fill = 0; + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + + // ref + sseq.l = node2rend(ibeg) - rbeg; + sseq.s = sref.s + N_REF_PAD + rbeg; + sseq.m = sref.m - 2*N_REF_PAD; + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; + + hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel); + ibeg = -1; + dlen = 0; + indel = 0; + } + } + } +} + +static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap) +{ + if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return; + + char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-"; + const char *chr = bcf_hdr_id2name(args->hdr,args->rid); + + fprintf(args->out,"CSQ\t%s\t", smpl); + if ( ihap>0 ) + fprintf(args->out,"%d", ihap); + else + fprintf(args->out,"-"); + + args->str.l = 0; + kput_vcsq(&csq->type, &args->str); + fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); +} +static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +{ + if ( !node || !node->ncsq_list ) return; + + char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-"; + const char *chr = bcf_hdr_id2name(args->hdr,args->rid); + + int i; + for (i=0; incsq_list; i++) + { + csq_t *csq = node->csq_list + i; + if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue; + assert( csq->type.vstr.l ); + + fprintf(args->out,"CSQ\t%s\t", smpl); + if ( ihap>0 ) + fprintf(args->out,"%d", ihap); + else + fprintf(args->out,"-"); + + args->str.l = 0; + kput_vcsq(&csq->type, &args->str); + fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); + } +} + +static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +{ + if ( !node || !node->ncsq_list || ismpl<0 ) return; + + int i; + for (i=0; incsq_list; i++) + { + csq_t *csq = node->csq_list + i; + vrec_t *vrec = csq->vrec; + int icsq = 2*csq->idx + ihap; + if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + { + int print_warning = 1; + if ( args->quiet ) + { + if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; + args->ncsq_small_warned = 1; + } + if ( print_warning ) + { + fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", + args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); + if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n"); + } + break; + } + if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; + vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + } +} + +void hap_flush(args_t *args, uint32_t pos) +{ + int i,j; + tr_heap_t *heap = args->active_tr; + + while ( heap->ndat && heap->dat[0]->end<=pos ) + { + tscript_t *tr = heap->dat[0]; + khp_delete(trhp, heap); + + args->hap->tr = tr; + if ( tr->root && tr->root->nchild ) // normal, non-localized calling + { + hap_finalize(args, args->hap); + + if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf + { + if ( args->phase==PHASE_DROP_GT ) + hap_print_text(args, tr, -1,0, tr->hap[0]); + else + { + for (i=0; ismpl->n; i++) + { + for (j=0; j<2; j++) + hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]); + } + } + } + else if ( args->phase!=PHASE_DROP_GT ) + { + for (i=0; ismpl->n; i++) + { + for (j=0; j<2; j++) + hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]); + } + } + } + + // mark the transcript for deletion. Cannot delete it immediately because + // by-position VCF output will need them when flushed by vcf_buf_push + args->nrm_tr++; + hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); + args->rm_tr[args->nrm_tr-1] = tr; + } +} + +#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } + +void vbuf_push(args_t *args, bcf1_t **rec_ptr) +{ + int i; + + assert(rec_ptr); + bcf1_t *rec = *rec_ptr; + + // check for duplicate records + i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1; + if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos ) + { + // vcf record with a new pos + rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf); + i = rbuf_append(&args->vcf_rbuf); + if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t)); + args->vcf_buf[i]->n = 0; + } + vbuf_t *vbuf = args->vcf_buf[i]; + vbuf->n++; + hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec); + if ( !vbuf->vrec[vbuf->n - 1] ) + vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t)); + + vrec_t *vrec = vbuf->vrec[vbuf->n - 1]; + if ( args->phase!=PHASE_DROP_GT && args->smpl->n ) + { + if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq); + else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq); + } + if ( !vrec->line ) vrec->line = bcf_init1(); + SWAP(bcf1_t*, (*rec_ptr), vrec->line); + + int ret; + khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret); + kh_val(args->pos2vbuf,k) = vbuf; +} + +void vbuf_flush(args_t *args) +{ + if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone + + int i,j; + while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 ) + { + vbuf_t *vbuf = args->vcf_buf[i]; + for (i=0; in; i++) + { + vrec_t *vrec = vbuf->vrec[i]; + if ( !args->out_fh ) // not a VCF output + { + vrec->nvcsq = 0; + continue; + } + if ( !vrec->nvcsq ) + { + bcf_write(args->out_fh, args->hdr, vrec->line); + continue; + } + + args->str.l = 0; + kput_vcsq(&vrec->vcsq[0], &args->str); + for (j=1; jnvcsq; j++) + { + kputc_(',', &args->str); + kput_vcsq(&vrec->vcsq[j], &args->str); + } + bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s); + if ( args->hdr_nsmpl ) + { + if ( vrec->nfmt < args->nfmt_bcsq ) + for (j=1; jhdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl)); + bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); + } + vrec->nvcsq = 0; + bcf_write(args->out_fh, args->hdr, vrec->line); + } + if ( vbuf->n ) + { + khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos); + if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k); + } + vbuf->n = 0; + } + + for (i=0; inrm_tr; i++) + { + tscript_t *tr = args->rm_tr[i]; + if ( tr->root ) hap_destroy(tr->root); + tr->root = NULL; + free(tr->hap); + free(tr->ref); + free(tr->sref); + } + args->nrm_tr = 0; + args->ncsq_buf = 0; +} + +void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) +{ + int i, len; + int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg; + + tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); + if ( !tr->ref ) + error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1); + + int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); + if ( pad_beg + pad_end != 2*N_REF_PAD ) + { + char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD); + for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; + memcpy(ref+i, tr->ref, len); + for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; + free(tr->ref); + tr->ref = ref; + } +} + +static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) +{ + char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0); + char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos); + assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) ); + while ( *ref && *vcf ) + { + if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) ) + error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]); + ref++; + vcf++; + } +} + +int test_cds_local(args_t *args, bcf1_t *rec) +{ + int i,j, ret = 0; + const char *chr = bcf_seqname(args->hdr,rec); + // note that the off-by-one extension of rlen is deliberate to account for insertions + if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + + // structures to fake the normal test_cds machinery + hap_node_t root, node; + root.type = HAP_ROOT; + kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq; + + while ( regitr_overlap(args->itr) ) + { + gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); + tscript_t *tr = cds->tr; + if ( !GF_is_coding(tr->type) ) continue; + ret = 1; + + if ( !tr->ref ) + { + tscript_init_ref(args, tr, chr); + tscript_splice_ref(tr); + khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards + } + + sanity_check_ref(args, tr, rec); + + kstring_t sref; + sref.s = tr->sref; + sref.l = tr->nsref; + sref.m = sref.l; + + for (i=1; in_allele; i++) + { + if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; + + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + + int csq_type = node.csq; + + // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though + if ( node.type == HAP_SSS ) + { + csq.type.type = csq_type; + csq_stage(args, &csq, rec); + } + else + { + kstring_t sseq; + sseq.m = sref.m - 2*N_REF_PAD + node.dlen; + sseq.s = node.seq; + int alen = sseq.l = strlen(sseq.s); + int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917 + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill); + + sseq.m = sref.m - 2*N_REF_PAD; + sseq.s = sref.s + N_REF_PAD + node.sbeg; + sseq.l = node.rlen; + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill); + + // check for truncating stops + for (j=0; jl; j++) + if ( tref->s[j]=='*' ) break; + if ( j!=tref->l ) + { + tref->l = j+1; + tref->s[j+1] = 0; + } + for (j=0; jl; j++) + if ( tseq->s[j]=='*' ) break; + if ( j!=tseq->l ) + { + tseq->l = j+1; + tseq->s[j+1] = 0; + } + if ( csq_type & CSQ_STOP_LOST ) + { + if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) + { + csq_type &= ~CSQ_STOP_LOST; + csq_type |= CSQ_STOP_RETAINED; + } + else if (tref->s[tref->l-1]!='*' ) + { + // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense + // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf + if ( tseq->s[tseq->l-1] == '*' ) + { + csq_type &= ~CSQ_STOP_GAINED; + csq_type |= CSQ_STOP_RETAINED; + } + else + csq_type |= CSQ_INCOMPLETE_CDS; + } + } + if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' ) + csq_type &= ~CSQ_START_LOST; + if ( node.dlen!=0 ) + { + if ( node.dlen%3 ) + csq_type |= CSQ_FRAMESHIFT_VARIANT; + else if ( node.dlen<0 ) + csq_type |= CSQ_INFRAME_DELETION; + else + csq_type |= CSQ_INFRAME_INSERTION; + } + else + { + for (j=0; jl; j++) + if ( tref->s[j] != tseq->s[j] ) break; + if ( j==tref->l ) + csq_type |= CSQ_SYNONYMOUS_VARIANT; + else if ( tref->s[j] == '*' ) + csq_type |= CSQ_STOP_LOST; + else if ( tseq->s[j] == '*' ) + csq_type |= CSQ_STOP_GAINED; + else + csq_type |= CSQ_MISSENSE_VARIANT; + } + if ( csq_type & CSQ_COMPOUND ) + { + // create the aa variant string + kstring_t str = {0,0,0}; + int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; + int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; + kputc_('|', &str); + kputw(aa_rbeg, &str); + kputs(tref->s, &str); + if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) + { + kputc_('>', &str); + kputw(aa_sbeg, &str); + kputs(tseq->s, &str); + } + kputc_('|', &str); + kputw(rec->pos+1, &str); + kputs(node.var, &str); + csq.type.vstr = str; + csq.type.type = csq_type & CSQ_COMPOUND; + csq_stage(args, &csq, rec); + + // all this only to clean vstr when vrec is flushed + if ( !tr->root ) + tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + tr->root->ncsq_list++; + hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list); + csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1; + rm_csq->type.vstr = str; + } + if ( csq_type & ~CSQ_COMPOUND ) + { + csq.type.type = csq_type & ~CSQ_COMPOUND; + csq.type.vstr.l = 0; + csq_stage(args, &csq, rec); + } + } + free(node.seq); + free(node.var); + } + } + return ret; +} + +int test_cds(args_t *args, bcf1_t *rec) +{ + int i, ret = 0, hap_ret; + const char *chr = bcf_seqname(args->hdr,rec); + // note that the off-by-one extension of rlen is deliberate to account for insertions + if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + while ( regitr_overlap(args->itr) ) + { + gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); + tscript_t *tr = cds->tr; + if ( !GF_is_coding(tr->type) ) continue; + ret = 1; + if ( !tr->root ) + { + // initialize the transcript and its haplotype tree, fetch the reference sequence + tscript_init_ref(args, tr, chr); + + tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid + tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*)); + for (i=0; inhap; i++) tr->hap[i] = NULL; + tr->root->nend = tr->nhap; + tr->root->type = HAP_ROOT; + + khp_insert(trhp, args->active_tr, &tr); + } + + sanity_check_ref(args, tr, rec); + + if ( args->phase==PHASE_DROP_GT ) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root; + hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t)); + if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 ) + { + // overlapping or intron variant, cannot apply + if ( hap_ret==1 ) + { + if ( !args->quiet ) + fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + if ( args->out ) + fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + } + else ret = 1; // prevent reporting as intron in test_tscript + free(child); + continue; + } + parent->nend--; + parent->nchild = 1; + parent->mchild = 1; + parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*)); + parent->child[0] = child; + tr->hap[0] = child; + tr->hap[0]->nend = 1; + continue; + } + + // apply the VCF variants and extend the haplotype tree + int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); + ngts /= bcf_hdr_nsamples(args->hdr); + if ( ngts!=1 && ngts!=2 ) + { + if ( !args->quiet ) + fprintf(stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + if ( args->out ) + fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + continue; + } + for (ismpl=0; ismplsmpl->n; ismpl++) + { + int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts; + if ( gt[0]==bcf_gt_missing ) continue; + + if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end ) + { + if ( args->phase==PHASE_MERGE ) + { + if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1]; + } + if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) + { + if ( args->phase==PHASE_REQUIRE ) + error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); + if ( args->phase==PHASE_SKIP ) + continue; + if ( args->phase==PHASE_NON_REF ) + { + if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1]; + else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0]; + } + } + } + + for (ihap=0; ihapn_allele ); + if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; } + + hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root; + if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 ) + { + // this haplotype has been seen in another sample + tr->hap[i] = parent->child[ parent->cur_child[ial] ]; + tr->hap[i]->nend++; + parent->nend--; + continue; + } + + hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t)); + if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 ) + { + // overlapping or intron variant, cannot apply + if ( hap_ret==1 ) + { + if ( !args->quiet ) + fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", + chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + if ( args->out ) + fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", + chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + } + free(child); + continue; + } + + if ( parent->cur_rec!=rec ) + { + hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child); + for (j=0; jn_allele; j++) parent->cur_child[j] = -1; + parent->cur_rec = rec; + } + + j = parent->nchild++; + hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child); + parent->cur_child[ial] = j; + parent->child[j] = child; + tr->hap[i] = child; + tr->hap[i]->nend++; + parent->nend--; + } + } + } + return ret; +} + +void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) +{ + // known issues: tab output leads to unsorted output. This is because + // coding haplotypes are printed in one go and buffering is not used + // with tab output. VCF output is OK though. + if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists + + int i,j,ngt = 0; + if ( args->phase!=PHASE_DROP_GT ) + { + ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); + if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr); + } + if ( ngt<=0 ) + { + if ( args->output_type==FT_TAB_TEXT ) + csq_print_text(args, csq, -1,0); + return; + } + assert( ngt<=2 ); + + if ( args->output_type==FT_TAB_TEXT ) + { + for (i=0; ismpl->n; i++) + { + int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; + for (j=0; jsmpl->idx[i],j+1); + } + } + return; + } + + vrec_t *vrec = csq->vrec; + for (i=0; ismpl->n; i++) + { + int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; + for (j=0; jidx + j; + if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + { + int ismpl = args->smpl->idx[i]; + int print_warning = 1; + if ( args->quiet ) + { + if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; + args->ncsq_small_warned = 1; + } + if ( print_warning ) + { + fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", + args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); + if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n"); + } + break; + } + if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; + vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + } + } +} +int test_utr(args_t *args, bcf1_t *rec) +{ + const char *chr = bcf_seqname(args->hdr,rec); + // note that the off-by-one extension of rlen is deliberate to account for insertions + if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + + splice_t splice; + splice_init(&splice, rec); + + int i, ret = 0; + while ( regitr_overlap(args->itr) ) + { + gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); + tscript_t *tr = splice.tr = utr->tr; + for (i=1; in_allele; i++) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + splice.vcf.alt = rec->d.allele[i]; + int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); + if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); + ret = 1; + } + } + assert(!splice.kref.s); + assert(!splice.kalt.s); + return ret; +} +int test_splice(args_t *args, bcf1_t *rec) +{ + const char *chr = bcf_seqname(args->hdr,rec); + if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; + + splice_t splice; + splice_init(&splice, rec); + splice.check_acceptor = splice.check_donor = 1; + + int i, ret = 0; + while ( regitr_overlap(args->itr) ) + { + gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*); + splice.tr = exon->tr; + if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites + + splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1; + splice.check_region_end = splice.tr->end==exon->end ? 0 : 1; + + for (i=1; in_allele; i++) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + splice.vcf.alt = rec->d.allele[i]; + splice_csq(args, &splice, exon->beg, exon->end); + if ( splice.csq ) ret = 1; + } + } + free(splice.kref.s); + free(splice.kalt.s); + return ret; +} +int test_tscript(args_t *args, bcf1_t *rec) +{ + const char *chr = bcf_seqname(args->hdr,rec); + if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + + splice_t splice; + splice_init(&splice, rec); + + int i, ret = 0; + while ( regitr_overlap(args->itr) ) + { + tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + for (i=1; in_allele; i++) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + splice.vcf.alt = rec->d.allele[i]; + int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); + if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); + ret = 1; + } + } + assert(!splice.kref.s); + assert(!splice.kalt.s); + return ret; +} + +void process(args_t *args, bcf1_t **rec_ptr) +{ + if ( !rec_ptr ) + { + hap_flush(args, REGIDX_MAX); + vbuf_flush(args); + return; + } + + bcf1_t *rec = *rec_ptr; + + int call_csq = 1; + if ( !rec->n_allele ) call_csq = 0; // no alternate allele + else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele + else if ( args->filter ) + { + call_csq = filter_test(args->filter, rec, NULL); + if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1; + } + if ( !call_csq ) + { + if ( !args->out_fh ) return; // not a VCF output + vbuf_push(args, rec_ptr); + vbuf_flush(args); + return; + } + + if ( args->rid != rec->rid ) + { + hap_flush(args, REGIDX_MAX); + vbuf_flush(args); + } + args->rid = rec->rid; + vbuf_push(args, rec_ptr); + + int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec); + hit += test_utr(args, rec); + hit += test_splice(args, rec); + if ( !hit ) test_tscript(args, rec); + + hap_flush(args, rec->pos-1); + vbuf_flush(args); + + return; +} + +const char *usage(void) +{ + return + "\n" + "About: Haplotype-aware consequence caller.\n" + "Usage: bcftools csq [options] in.vcf\n" + "\n" + "Required options:\n" + " -f, --fasta-ref reference file in fasta format\n" + " -g, --gff-annot gff3 annotation file\n" + "\n" + "CSQ options:\n" + " -c, --custom-tag use this tag instead of the default BCSQ\n" + " -l, --local-csq localized predictions, consider only one VCF record at a time\n" + " -n, --ncsq maximum number of consequences to consider per site [16]\n" + " -p, --phase how to construct haplotypes and how to deal with unphased data: [r]\n" + " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" + " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" + " r: require phased GTs, throw an error on unphased het GTs\n" + " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" + " s: skip unphased GTs\n" + "Options:\n" + " -e, --exclude exclude sites for which the expression is true\n" + " -i, --include select sites for which the expression is true\n" + " -o, --output write output to a file [standard output]\n" + " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" + " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" + " -q, --quiet suppress warning messages. Can be given two times for even less messages\n" + " -r, --regions restrict to comma-separated list of regions\n" + " -R, --regions-file restrict to regions listed in a file\n" + " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" + " -S, --samples-file samples to include\n" + " -t, --targets similar to -r but streams rather than index-jumps\n" + " -T, --targets-file similar to -R but streams rather than index-jumps\n" + "\n" + "Example:\n" + " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" + "\n" + " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n" + " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n" + " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n" + "\n"; +} + +int main_csq(int argc, char *argv[]) +{ + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + args->output_type = FT_VCF; + args->bcsq_tag = "BCSQ"; + args->ncsq_max = 2*16; + + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"ncsq",1,0,'n'}, + {"custom-tag",1,0,'c'}, + {"local-csq",0,0,'l'}, + {"gff-annot",1,0,'g'}, + {"fasta-ref",1,0,'f'}, + {"include",1,0,'i'}, + {"exclude",1,0,'e'}, + {"output",1,0,'o'}, + {"output-type",1,NULL,'O'}, + {"phase",1,0,'p'}, + {"quiet",0,0,'q'}, + {"regions",1,0,'r'}, + {"regions-file",1,0,'R'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, + {"targets",1,0,'t'}, + {"targets-file",1,0,'T'}, + {0,0,0,0} + }; + int c, targets_is_file = 0, regions_is_file = 0; + char *targets_list = NULL, *regions_list = NULL; + while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0) + { + switch (c) + { + case 'l': args->local_csq = 1; break; + case 'c': args->bcsq_tag = optarg; break; + case 'q': args->quiet++; break; + case 'p': + switch (optarg[0]) + { + case 'a': args->phase = PHASE_AS_IS; break; + case 'm': args->phase = PHASE_MERGE; break; + case 'r': args->phase = PHASE_REQUIRE; break; + case 'R': args->phase = PHASE_NON_REF; break; + case 's': args->phase = PHASE_SKIP; break; + default: error("The -p code \"%s\" not recognised\n", optarg); + } + break; + case 'f': args->fa_fname = optarg; break; + case 'g': args->gff_fname = optarg; break; + case 'n': + args->ncsq_max = 2 * atoi(optarg); + if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg); + break; + case 'o': args->output_fname = optarg; break; + case 'O': + switch (optarg[0]) { + case 't': args->output_type = FT_TAB_TEXT; break; + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: error("The output type \"%s\" not recognised\n", optarg); + } + break; + case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'r': regions_list = optarg; break; + case 'R': regions_list = optarg; regions_is_file = 1; break; + case 's': args->sample_list = optarg; break; + case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; + case 't': targets_list = optarg; break; + case 'T': targets_list = optarg; targets_is_file = 1; break; + case 'h': + case '?': error("%s",usage()); + default: error("The option not recognised: %s\n\n", optarg); break; + } + } + char *fname = NULL; + if ( optind==argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + else error("%s", usage()); + } + else fname = argv[optind]; + if ( argc - optind>1 ) error("%s", usage()); + if ( !args->fa_fname ) error("Missing the --fa-ref option\n"); + if ( !args->gff_fname ) error("Missing the --gff option\n"); + args->sr = bcf_sr_init(); + if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 ) + error("Failed to read the targets: %s\n", targets_list); + if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", regions_list); + if ( !bcf_sr_add_reader(args->sr, fname) ) + error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum)); + args->hdr = bcf_sr_get_header(args->sr,0); + + init_data(args); + while ( bcf_sr_next_line(args->sr) ) + { + process(args, &args->sr->readers[0].buffer[0]); + } + process(args,NULL); + + destroy_data(args); + bcf_sr_destroy(args->sr); + free(args); + + return 0; +} + diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c new file mode 100644 index 0000000..b79a030 --- /dev/null +++ b/bcftools/csq.c.pysam.c @@ -0,0 +1,3826 @@ +#include "pysam.h" + +/* The MIT License + + Copyright (c) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ +/* + Things that would be nice to have + - for stop-lost events (also in frameshifts) report the number of truncated aa's + - memory could be greatly reduced by indexing gff (but it is quite compact already) + - deletions that go beyond transcript boundaries are not checked at sequence level + - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16 + - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882 + + Read about transcript types here + http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html + http://www.ensembl.org/info/genome/variation/predicted_data.html + http://www.gencodegenes.org/gencode_biotypes.html + + List of supported biotypes + antisense + IG_C_gene + IG_D_gene + IG_J_gene + IG_LV_gene + IG_V_gene + lincRNA + macro_lncRNA + miRNA + misc_RNA + Mt_rRNA + Mt_tRNA + polymorphic_pseudogene + processed_transcript + protein_coding + ribozyme + rRNA + sRNA + scRNA + scaRNA + sense_intronic + sense_overlapping + snRNA + snoRNA + TR_C_gene + TR_D_gene + TR_J_gene + TR_V_gene + + The gff parsing logic + We collect features such by combining gff lines A,B,C as follows: + A .. gene line with a supported biotype + A.ID=~/^gene:/ + + B .. transcript line referencing A + B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/ + + C .. corresponding CDS, exon, and UTR lines: + C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ + + For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the + complete chain link C -> B -> A is required. For the rest, link B -> A suffices. + + + The supported consequence types, sorted by impact: + splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron) + splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron) + stop_gained .. DNA sequence variant resulting in a stop codon + frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame + stop_lost .. elongated transcript, stop codon changed + start_lost .. the first codon changed + inframe_altering .. combination of indels leading to unchanged reading frame and length + inframe_insertion .. inserted coding sequence, unchanged reading frame + inframe_deletion .. deleted coding sequence, unchanged reading frame + missense_variant .. amino acid (aa) change, unchanged length + splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron + synonymous_variant .. DNA sequence variant resulting in no amino acid change + stop_retained_variant .. different stop codon + non_coding_variant .. variant in non-coding sequence, such as RNA gene + 5_prime_UTR_variant + 3_prime_UTR_variant + intron_variant .. reported only if none of the above + intergenic_variant .. reported only if none of the above + + + The annotation algorithm. + The algorithm checks if the variant falls in a region of a supported type. The + search is performed in the following order, until a match is found: + 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences + 2. idx_utr(gf_utr_t) - check UTR hits + 3. idx_exon(gf_exon_t) - check for splice variants + 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc. + + These regidx indexes are created by parsing a gff3 file as follows: + 1. create the array "ftr" of all UTR, CDS, exons. This will be + processed later and pruned based on transcript types we want to keep. + In the same go, create the hash "id2tr" of transcripts to keep + (based on biotype) which maps from transcript_id to a transcript. At + the same time also build the hash "gid2gene" which maps from gene_id to + gf_gene_t pointer. + + 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes. + Use only features from "ftr" which are present in "id2tr". + + 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene. + + Data structures. + idx_cds, idx_utr, idx_exon, idx_tscript: + as described above, regidx structures for fast lookup of exons/transcripts + overlapping a region, the payload is a pointer to tscript.cds +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "filter.h" +#include "regidx.h" +#include "kheap.h" +#include "smpl_ilist.h" +#include "rbuf.h" + +#ifndef __FUNCTION__ +# define __FUNCTION__ __func__ +#endif + +// Logic of the filters: include or exclude sites which match the filters? +#define FLT_INCLUDE 1 +#define FLT_EXCLUDE 2 + +// Definition of splice_region, splice_acceptor and splice_donor +#define N_SPLICE_DONOR 2 +#define N_SPLICE_REGION_EXON 3 +#define N_SPLICE_REGION_INTRON 8 + +// Ensembl ID format, e.g. +// ENST00000423372 for human .. ENST%011d +// ENSMUST00000120394 for mouse .. ENSMUST%011d +char ENSID_BUF[32], *ENSID_FMT = NULL; +static inline char *ENSID(uint32_t id) +{ + sprintf(ENSID_BUF,ENSID_FMT,id); + return ENSID_BUF; +} + + +#define N_REF_PAD 10 // number of bases to avoid boundary effects + +#define STRAND_REV 0 +#define STRAND_FWD 1 + +#define TRIM_NONE 0 +#define TRIM_5PRIME 1 +#define TRIM_3PRIME 2 + +// How to treat phased/unphased genotypes +#define PHASE_REQUIRE 0 // --phase r +#define PHASE_MERGE 1 // --phase m +#define PHASE_AS_IS 2 // --phase a +#define PHASE_SKIP 3 // --phase s +#define PHASE_NON_REF 4 // --phase R +#define PHASE_DROP_GT 5 // --samples - + +// Node types in the haplotype tree +#define HAP_CDS 0 +#define HAP_ROOT 1 +#define HAP_SSS 2 // start/stop/splice + +#define CSQ_PRINTED_UPSTREAM (1<<0) +#define CSQ_SYNONYMOUS_VARIANT (1<<1) +#define CSQ_MISSENSE_VARIANT (1<<2) +#define CSQ_STOP_LOST (1<<3) +#define CSQ_STOP_GAINED (1<<4) +#define CSQ_INFRAME_DELETION (1<<5) +#define CSQ_INFRAME_INSERTION (1<<6) +#define CSQ_FRAMESHIFT_VARIANT (1<<7) +#define CSQ_SPLICE_ACCEPTOR (1<<8) +#define CSQ_SPLICE_DONOR (1<<9) +#define CSQ_START_LOST (1<<10) +#define CSQ_SPLICE_REGION (1<<11) +#define CSQ_STOP_RETAINED (1<<12) +#define CSQ_UTR5 (1<<13) +#define CSQ_UTR3 (1<<14) +#define CSQ_NON_CODING (1<<15) +#define CSQ_INTRON (1<<16) +//#define CSQ_INTERGENIC (1<<17) +#define CSQ_INFRAME_ALTERING (1<<18) +#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string +#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf +#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence + +// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345 +#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \ + CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \ + CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \ + CSQ_UPSTREAM_STOP) +#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST) + +#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) +#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) +#define CSQ_PRN_BIOTYPE CSQ_NON_CODING + +// see kput_vcsq() +const char *csq_strings[] = +{ + NULL, + "synonymous", + "missense", + "stop_lost", + "stop_gained", + "inframe_deletion", + "inframe_insertion", + "frameshift", + "splice_acceptor", + "splice_donor", + "start_lost", + "splice_region", + "stop_retained", + "5_prime_utr", + "3_prime_utr", + "non_coding", + "intron", + "intergenic", + "inframe_altering", + NULL, + NULL, + "coding_sequence" +}; + + +// GFF line types +#define GFF_TSCRIPT_LINE 1 +#define GFF_GENE_LINE 2 + + +/* + Genomic features, for fast lookup by position to overlapping features +*/ +#define GF_coding_bit 6 +#define GF_is_coding(x) ((x) & (1<5I|121ACG>A+124TA>T" + + vcrec_t + single VCF record and csq tied to this record. (Haplotype can have multiple + consequences in several VCF records. Each record can have multiple consequences + from multiple haplotypes.) + + csq_t + a top-level consequence tied to a haplotype + + vbuf_t + pos2vbuf + VCF records with the same position clustered together for a fast lookup via pos2vbuf +*/ +typedef struct _vbuf_t vbuf_t; +typedef struct _vcsq_t vcsq_t; +struct _vcsq_t +{ + uint32_t strand:1, + type:31; // one of CSQ_* types + uint32_t trid; + uint32_t biotype; // one of GF_* types + char *gene; // gene name + bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234" + kstring_t vstr; // variant string, eg 5TY>5I|121ACG>A+124TA>T +}; +typedef struct +{ + bcf1_t *line; + uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved + uint32_t nfmt:4, nvcsq:28, mvcsq; + vcsq_t *vcsq; // there can be multiple consequences for a single VCF record +} +vrec_t; +typedef struct +{ + uint32_t pos; + vrec_t *vrec; // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf) + int idx; // 0-based index of the csq at the VCF line, for FMT/BCSQ + vcsq_t type; +} +csq_t; +struct _vbuf_t +{ + vrec_t **vrec; // buffer of VCF lines with the same position + int n, m; +}; +KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*) + + +/* + Structures related to haplotype-aware consequences in coding regions + + hap_node_t + node of a haplotype tree. Each transcript has one tree + + tscript_t + despite its general name, it is intended for coding transcripts only + + hap_t + hstack_t + for traversal of the haplotype tree and braking combined + consequences into independent parts +*/ +typedef struct _hap_node_t hap_node_t; +struct _hap_node_t +{ + char *seq; // cds segment [parent_node,this_node) + char *var; // variant "ref>alt" + uint32_t type:2, // HAP_ROOT or HAP_CDS + csq:30; // this node's consequence + int dlen; // alt minus ref length: <0 del, >0 ins, 0 substitution + uint32_t rbeg; // variant's VCF position (0-based, inclusive) + int32_t rlen; // variant's rlen; alen=rlen+dlen; fake for non CDS types + uint32_t sbeg; // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included) + uint32_t icds; // which exon does this node's variant overlaps + hap_node_t **child, *prev; // children haplotypes and previous coding node + int nchild, mchild; + bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record + uint32_t nend; // number of haplotypes ending in this node + int *cur_child, mcur_child; // mapping from the allele to the currently active child + csq_t *csq_list; // list of haplotype's consequences, broken by position + int ncsq_list, mcsq_list; +}; +struct _tscript_t +{ + uint32_t id; // transcript id + uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive) + uint32_t strand:1, // STRAND_REV or STRAND_FWD + ncds:31, // number of exons + mcds; + gf_cds_t **cds; // ordered list of exons + char *ref; // reference sequence, padded with N_REF_PAD bases on both ends + char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends + hap_node_t *root; // root of the haplotype tree + hap_node_t **hap; // pointer to haplotype leaves, two for each sample + int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD + uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types + type:30; // one of GF_* types + gf_gene_t *gene; +}; +static inline int cmp_tscript(tscript_t **a, tscript_t **b) +{ + return ( (*a)->end < (*b)->end ) ? 1 : 0; +} +KHEAP_INIT(trhp, tscript_t*, cmp_tscript) +typedef khp_trhp_t tr_heap_t; +typedef struct +{ + hap_node_t *node; // current node + int ichild; // current child in the active node + int dlen; // total dlen, from the root to the active node + size_t slen; // total sequence length, from the root to the active node +} +hstack_t; +typedef struct +{ + int mstack; + hstack_t *stack; + tscript_t *tr; // tr->ref: spliced transcript on ref strand + kstring_t sseq; // spliced haplotype sequence on ref strand + kstring_t tseq; // the variable part of translated haplotype transcript, coding strand + kstring_t tref; // the variable part of translated reference transcript, coding strand + uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS + int upstream_stop; +} +hap_t; + + +/* + Helper structures, only for initialization + + ftr_t + temporary list of all exons, CDS, UTRs +*/ +KHASH_MAP_INIT_INT(int2tscript, tscript_t*) +KHASH_MAP_INIT_INT(int2int, int) +KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) +typedef struct +{ + int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR + uint32_t beg; + uint32_t end; + uint32_t trid; + uint32_t strand:1; // STRAND_REV,STRAND_FWD + uint32_t phase:2; // 0, 1 or 2 + uint32_t iseq:29; +} +ftr_t; +typedef struct +{ + // all exons, CDS, UTRs + ftr_t *ftr; + int nftr, mftr; + + // mapping from transcript ensembl id to gene id + kh_int2gene_t *gid2gene; + + // mapping from transcript id to tscript, for quick CDS anchoring + kh_int2tscript_t *id2tr; + + // sequences + void *seq2int; + char **seq; + int nseq, mseq; + + // ignored biotypes + void *ignored_biotypes; +} +aux_t; + +typedef struct _args_t +{ + // the main regidx lookups, from chr:beg-end to overlapping features and + // index iterator + regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; + regitr_t *itr; + + // temporary structures, deleted after initializtion + aux_t init; + + // text tab-delimited output (out) or vcf/bcf output (out_fh) + FILE *out; + htsFile *out_fh; + + // vcf + bcf_srs_t *sr; + bcf_hdr_t *hdr; + int hdr_nsmpl; // actual number of samples in the vcf, for bcf_update_format_values() + + // include or exclude sites which match the filters + filter_t *filter; + char *filter_str; + int filter_logic; // FLT_INCLUDE or FLT_EXCLUDE + + // samples to process + int sample_is_file; + char *sample_list; + smpl_ilist_t *smpl; + + char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; + char *bcsq_tag; + int argc, output_type; + int phase, quiet, local_csq; + int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ + int ncsq_small_warned; + + int rid; // current chromosome + tr_heap_t *active_tr; // heap of active transcripts for quick flushing + hap_t *hap; // transcript haplotype recursion + vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush + rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf + kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position + tscript_t **rm_tr; // buffer of transcripts to clean + int nrm_tr, mrm_tr; + csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs + int ncsq_buf, mcsq_buf; + + faidx_t *fai; + kstring_t str, str2; + int32_t *gt_arr, mgt_arr; +} +args_t; + +// AAA, AAC, ... +const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; +const uint8_t nt4[] = +{ + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4, + 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 3 +}; +const uint8_t cnt4[] = +{ + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4, + 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4, + 4,4,4,4, 0 +}; +#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] +#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] + +static const char *gf_strings_noncoding[] = +{ + "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", + "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", + "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", + "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", + "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", + "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", + "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", + "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf" +}; +static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; +static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; + +const char *gf_type2gff_string(int type) +{ + if ( !GF_is_coding(type) ) + { + if ( type < (1<init; + char c = chr_end[1]; + chr_end[1] = 0; + int iseq; + if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) + { + hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); + aux->seq[aux->nseq] = strdup(chr_beg); + iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); + aux->nseq++; + assert( aux->nseq < 256 ); // see gf_gene_t.iseq + } + chr_end[1] = c; + return iseq; +} +static inline char *gff_skip(const char *line, char *ss) +{ + while ( *ss && *ss!='\t' ) ss++; + if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return ss+1; +} +static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end) +{ + char *se = (char*) line; + while ( *se && *se!='\t' ) se++; + if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + *chr_beg = (char*) line; + *chr_end = se-1; +} +static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) +{ + char *se = ss; + *beg = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); + ss = se+1; + *end = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return se+1; +} +static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss) +{ + ss = strstr(ss,needle); + if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); + ss += strlen(needle); + while ( *ss && !isdigit(*ss) ) ss++; + if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); + char *se; + uint32_t id = strtol(ss, &se, 10); + if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); + if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice + return id; +} +static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss) +{ + ss = strstr(ss,needle); + if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); + ss += strlen(needle); + char *se = ss; + while ( *se && !isdigit(*se) ) se++; + kstring_t str = {0,0,0}; + kputsn(ss,se-ss,&str); + ss = se; + while ( *se && isdigit(*se) ) se++; + ksprintf(&str,"%%0%dd",(int)(se-ss)); + ENSID_FMT = str.s; +} +static inline int gff_parse_type(char *line) +{ + line = strstr(line,"ID="); + if ( !line ) return -1; + line += 3; + if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE; + else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE; + return -1; +} +static inline int gff_parse_biotype(char *_line) +{ + char *line = strstr(_line,"biotype="); + if ( !line ) return -1; + + line += 8; + switch (*line) + { + case 'p': + if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; + else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; + else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; + else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; + break; + case 'a': + if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT; + else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; + else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; + break; + case 'I': + if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C; + else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D; + else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J; + else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV; + else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V; + else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; + else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; + else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; + else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; + break; + case 'T': + if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C; + else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D; + else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J; + else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V; + else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; + else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; + break; + case 'M': + if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; + else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; + else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; + break; + case 'l': + if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; + break; + case 'm': + if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; + else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA; + else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; + break; + case 'r': + if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; + else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; + else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON; + else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; + break; + case 's': + if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; + else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; + else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; + else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; + else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; + else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; + else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; + break; + case 't': + if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; + break; + case 'n': + if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; + else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; + break; + case 'k': + if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; + break; + case 'u': + if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; + break; + case 'L': + if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; + break; + case '3': + if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; + break; + case 'd': + if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; + break; + case 'v': + if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; + break; + case 'b': + if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; + break; + } + return 0; +} +static inline int gff_ignored_biotype(args_t *args, char *ss) +{ + ss = strstr(ss,"biotype="); + if ( !ss ) return 0; + + ss += 8; + char *se = ss, tmp; + while ( *se && *se!=';' ) se++; + tmp = *se; + *se = 0; + + char *key = ss; + int n = 0; + if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); + khash_str2int_set(args->init.ignored_biotypes, key, n+1); + + *se = tmp; + return 1; +} +gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) +{ + khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); + gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); + if ( !gene ) + { + gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); + int ret; + k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); + kh_val(aux->gid2gene,k) = gene; + } + return gene; +} +void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) +{ + aux_t *aux = &args->init; + int biotype = gff_parse_biotype(ss); + if ( biotype <= 0 ) + { + if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored transcript: %s\n",line); + return; + } + + // create a mapping from transcript_id to gene_id + uint32_t trid = gff_parse_id(line, "ID=transcript:", ss); + uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss); + + if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species + + tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); + tr->id = trid; + tr->strand = ftr->strand; + tr->gene = gene_init(aux, gene_id); + tr->type = biotype; + tr->beg = ftr->beg; + tr->end = ftr->end; + + khint_t k; + int ret; + k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); + kh_val(aux->id2tr,k) = tr; +} +void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr) +{ + int biotype = gff_parse_biotype(ss); + if ( biotype <= 0 ) + { + if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored gene: %s\n",line); + return; + } + + aux_t *aux = &args->init; + + // substring search for "ID=gene:ENSG00000437963" + uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss); + gf_gene_t *gene = gene_init(aux, gene_id); + assert( !gene->name ); // the gene_id should be unique + + gene->iseq = feature_set_seq(args, chr_beg,chr_end); + + // substring search for "Name=OR4F5" + ss = strstr(chr_end+2,"Name="); + if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line); + ss += 5; + char *se = ss; + while ( *se && *se!=';' && !isspace(*se) ) se++; + gene->name = (char*) malloc(se-ss+1); + memcpy(gene->name,ss,se-ss); + gene->name[se-ss] = 0; +} +int gff_parse(args_t *args, char *line, ftr_t *ftr) +{ + // - skip empty lines and commented lines + // - columns + // 1. chr + // 2. + // 3. CDS, transcript, gene, ... + // 4-5. beg,end + // 6. + // 7. strand + // 8. phase + // 9. Parent=transcript:ENST(\d+);ID=... etc + + char *ss = line; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *chr_beg, *chr_end; + gff_parse_chr(line, &chr_beg, &chr_end); + ss = gff_skip(line, chr_end + 2); + + // 3. column: is this a CDS, transcript, gene, etc. + if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } + else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } + else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } + else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } + else + { + ss = gff_skip(line, ss); + ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); + ss = gff_skip(line, ss); + int type = gff_parse_type(ss); + if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) + { + // we ignore these, debug print to see new types: + ss = strstr(ss,"ID="); + if ( !ss ) return -1; // no ID, ignore the line + if ( !strncmp("chromosome",ss+3,10) ) return -1; + if ( !strncmp("supercontig",ss+3,11) ) return -1; + if ( args->quiet<2 ) fprintf(pysam_stderr,"ignored: %s\n", line); + return -1; + } + + // 7. column: strand + if ( *ss == '+' ) ftr->strand = STRAND_FWD; + else if ( *ss == '-' ) ftr->strand = STRAND_REV; + else error("Unknown strand: %c .. %s\n", *ss,ss); + + if ( type==GFF_TSCRIPT_LINE ) + gff_parse_transcript(args, line, ss, ftr); + else + gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr); + + return -1; + } + ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); + ss = gff_skip(line, ss); + + // 7. column: strand + if ( *ss == '+' ) ftr->strand = STRAND_FWD; + else if ( *ss == '-' ) ftr->strand = STRAND_REV; + else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } + ss += 2; + + // 8. column: phase (codon offset) + if ( *ss == '0' ) ftr->phase = 0; + else if ( *ss == '1' ) ftr->phase = 1; + else if ( *ss == '2' ) ftr->phase = 2; + else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase + else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } + ss += 2; + + // substring search for "Parent=transcript:ENST00000437963" + ftr->trid = gff_parse_id(line, "Parent=transcript:", ss); + ftr->iseq = feature_set_seq(args, chr_beg,chr_end); + return 0; +} + +static int cmp_cds_ptr(const void *a, const void *b) +{ + // comparison function for qsort of transcripts's CDS + if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; + if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1; + return 0; +} + +static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) +{ + *chr_beg = *chr_end = aux->seq[iseq]; + while ( (*chr_end)[1] ) (*chr_end)++; +} +tscript_t *tscript_init(aux_t *aux, uint32_t trid) +{ + khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); + tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k); + assert( tr ); + return tr; +} +void register_cds(args_t *args, ftr_t *ftr) +{ + // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. + // ftr is the result of parsing a gff CDS line + aux_t *aux = &args->init; + + tscript_t *tr = tscript_init(aux, ftr->trid); + if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand); + + gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); + cds->tr = tr; + cds->beg = ftr->beg; + cds->len = ftr->end - ftr->beg + 1; + cds->icds = 0; // to keep valgrind on mac happy + cds->phase = ftr->phase; + + hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); + tr->cds[tr->ncds++] = cds; +} +void register_utr(args_t *args, ftr_t *ftr) +{ + aux_t *aux = &args->init; + gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); + utr->which = ftr->type==GF_UTR3 ? prime3 : prime5; + utr->beg = ftr->beg; + utr->end = ftr->end; + utr->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); +} +void register_exon(args_t *args, ftr_t *ftr) +{ + aux_t *aux = &args->init; + gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); + exon->beg = ftr->beg; + exon->end = ftr->end; + exon->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); +} + +void tscript_init_cds(args_t *args) +{ + aux_t *aux = &args->init; + + // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) + khint_t k; + for (k=0; kid2tr); k++) + { + if ( !kh_exist(aux->id2tr, k) ) continue; + tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k); + + // position-to-tscript lookup + char *chr_beg, *chr_end; + chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); + + if ( !tr->ncds ) continue; // transcript with no CDS + + // sort CDs + qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); + + // trim non-coding start + int i, len = 0; + if ( tr->strand==STRAND_FWD ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + + // sanity check phase + for (i=0; incds; i++) + { + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3) + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + assert( phase == len%3 ); + len += tr->cds[i]->len; + } + } + else + { + // Check that the phase is not bigger than CDS length. Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + tr->cds[i]->len -= tr->cds[i]->phase; + tr->cds[i]->phase = 0; + + // sanity check phase + for (i=tr->ncds-1; i>=0; i--) + { + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3) + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + len += tr->cds[i]->len; + } + } + + // set len. At the same check that CDS within a transcript do not overlap + len = 0; + for (i=0; incds; i++) + { + tr->cds[i]->icds = i; + len += tr->cds[i]->len; + if ( !i ) continue; + + gf_cds_t *a = tr->cds[i-1]; + gf_cds_t *b = tr->cds[i]; + if ( a->beg + a->len - 1 >= b->beg ) + error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n", + kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + } + if ( len%3 != 0 ) + { + // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 + // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 + // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. + + tr->trim |= TRIM_3PRIME; + if ( tr->strand==STRAND_FWD ) + { + i = tr->ncds - 1; + while ( i>=0 && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + len -= dlen; + i--; + } + } + else + { + i = 0; + while ( incds && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + tr->cds[i]->beg += dlen; + len -= dlen; + i++; + } + } + } + + // set CDS offsets and insert into regidx + len=0; + for (i=0; incds; i++) + { + tr->cds[i]->pos = len; + len += tr->cds[i]->len; + regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); + } + } +} + +void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } +void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); } + +void init_gff(args_t *args) +{ + aux_t *aux = &args->init; + aux->seq2int = khash_str2int_init(); // chrom's numeric id + aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene + aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t + args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL); + aux->ignored_biotypes = khash_str2int_init(); + + // parse gff + kstring_t str = {0,0,0}; + htsFile *fp = hts_open(args->gff_fname,"r"); + if ( !fp ) error("Failed to read %s\n", args->gff_fname); + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); + int ret = gff_parse(args, str.s, aux->ftr + aux->nftr); + if ( !ret ) aux->nftr++; + } + free(str.s); + if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname); + + + // process gff information: connect CDS and exons to transcripts + args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); + args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); + args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); + args->itr = regitr_init(NULL); + + int i; + for (i=0; inftr; i++) + { + ftr_t *ftr = &aux->ftr[i]; + + // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? + khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); + if ( k==kh_end(aux->id2tr) ) continue; // no such transcript + + tscript_t *tr = kh_val(aux->id2tr,k); + if ( !tr->gene->name ) + { + // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript) + regidx_free_tscript(&tr); + kh_del(int2tscript, aux->id2tr,k); + continue; + } + + // populate regidx by category: + // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 + // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... + if ( ftr->type==GF_CDS ) register_cds(args, ftr); + else if ( ftr->type==GF_EXON ) register_exon(args, ftr); + else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr); + else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr); + else + error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type)); + } + tscript_init_cds(args); + + if ( !args->quiet ) + { + fprintf(pysam_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + regidx_nregs(args->idx_tscript), + regidx_nregs(args->idx_exon), + regidx_nregs(args->idx_cds), + regidx_nregs(args->idx_utr)); + } + + free(aux->ftr); + khash_str2int_destroy_free(aux->seq2int); + // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); + kh_destroy(int2tscript,aux->id2tr); + free(aux->seq); + + if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) + { + khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; + fprintf(pysam_stderr,"Ignored the following biotypes:\n"); + for (i = kh_begin(ign); i < kh_end(ign); i++) + { + if ( !kh_exist(ign,i)) continue; + fprintf(pysam_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i)); + } + } + khash_str2int_destroy_free(aux->ignored_biotypes); +} + +void init_data(args_t *args) +{ + args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; + + if ( !args->quiet ) fprintf(pysam_stderr,"Parsing %s ...\n", args->gff_fname); + init_gff(args); + + args->rid = -1; + + if ( args->filter_str ) + args->filter = filter_init(args->hdr, args->filter_str); + + args->fai = fai_load(args->fa_fname); + if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); + + args->pos2vbuf = kh_init(pos2vbuf); + args->active_tr = khp_init(trhp); + args->hap = (hap_t*) calloc(1,sizeof(hap_t)); + + // init samples + if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT; + if ( args->sample_list && !strcmp("-",args->sample_list) ) + { + // ignore all samples + if ( args->output_type==FT_TAB_TEXT ) + { + // significant speedup for plain VCFs + bcf_hdr_set_samples(args->hdr,NULL,0); + } + args->phase = PHASE_DROP_GT; + } + else + args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT); + args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr); + + if ( args->output_type==FT_TAB_TEXT ) + { + args->out = args->output_fname ? fopen(args->output_fname,"w") : pysam_stdout; + if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno)); + + fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version()); + fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]); + int i; + for (i=1; iargc; i++) + fprintf(args->out," %s",args->argv[i]); + fprintf(args->out,"\n"); + fprintf(args->out,"# LOG\t[2]Message\n"); + fprintf(args->out,"# CSQ"); i = 1; + fprintf(args->out,"\t[%d]Sample", ++i); + fprintf(args->out,"\t[%d]Haplotype", ++i); + fprintf(args->out,"\t[%d]Chromosome", ++i); + fprintf(args->out,"\t[%d]Position", ++i); + fprintf(args->out,"\t[%d]Consequence", ++i); + fprintf(args->out,"\n"); + } + else + { + args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); + bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); + bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); + if ( args->hdr_nsmpl ) + bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); + bcf_hdr_write(args->out_fh, args->hdr); + } + if ( !args->quiet ) fprintf(pysam_stderr,"Calling...\n"); +} + +void destroy_data(args_t *args) +{ + regidx_destroy(args->idx_cds); + regidx_destroy(args->idx_utr); + regidx_destroy(args->idx_exon); + regidx_destroy(args->idx_tscript); + regitr_destroy(args->itr); + + khint_t k,i,j; + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(args->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k); + free(gene->name); + free(gene); + } + kh_destroy(int2gene,args->init.gid2gene); + + if ( args->filter ) + filter_destroy(args->filter); + + khp_destroy(trhp,args->active_tr); + kh_destroy(pos2vbuf,args->pos2vbuf); + if ( args->smpl ) smpl_ilist_destroy(args->smpl); + int ret; + if ( args->out_fh ) + ret = hts_close(args->out_fh); + else + ret = fclose(args->out); + if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"pysam_stdout"); + for (i=0; ivcf_rbuf.m; i++) + { + vbuf_t *vbuf = args->vcf_buf[i]; + if ( !vbuf ) continue; + for (j=0; jm; j++) + { + if ( !vbuf->vrec[j] ) continue; + if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line); + free(vbuf->vrec[j]->smpl); + free(vbuf->vrec[j]->vcsq); + free(vbuf->vrec[j]); + } + free(vbuf->vrec); + free(vbuf); + } + free(args->vcf_buf); + free(args->rm_tr); + free(args->csq_buf); + free(args->hap->stack); + free(args->hap->sseq.s); + free(args->hap->tseq.s); + free(args->hap->tref.s); + free(args->hap); + fai_destroy(args->fai); + free(args->gt_arr); + free(args->str.s); + free(args->str2.s); + free(ENSID_FMT); +} + +/* + The splice_* functions are for consquences around splice sites: start,stop,splice_* + */ +#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely +#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region +#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed +#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq +typedef struct +{ + tscript_t *tr; + struct { + int32_t pos, rlen, alen; + char *ref, *alt; + bcf1_t *rec; + } vcf; + uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev) + check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon + check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon + check_donor:1, // as with check_acceptor + check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon + check_region_end:1, // + check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr + set_refalt:1; // set kref,kalt, if set, check also for synonymous events + uint32_t csq; + int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele + uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives + ref_end; // a more conservative csq (the first and last base in kref.s) + kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP +} +splice_t; +void splice_init(splice_t *splice, bcf1_t *rec) +{ + memset(splice,0,sizeof(*splice)); + splice->vcf.rec = rec; + splice->vcf.pos = rec->pos; + splice->vcf.rlen = rec->rlen; + splice->vcf.ref = rec->d.allele[0]; +} +static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) +{ + // len>0 .. beg is the first base, del filled from right + // len<0 .. beg is the last base, del filled from left + + int rlen, alen, rbeg, abeg; // first base to include (ref coordinates) + if ( len<0 ) + { + rlen = alen = -len; + rbeg = beg - rlen + 1; + int dlen = splice->vcf.alen - splice->vcf.rlen; + if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle + dlen += splice->ref_end - beg; + abeg = rbeg + dlen; + } + else + { + rbeg = abeg = beg; + rlen = alen = len; + // check for incomplete del as above?? + } + +#define XDBG 0 +#if XDBG +fprintf(pysam_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); +#endif + splice->kref.l = 0; + splice->kalt.l = 0; + + // add the part before vcf.ref, in the vcf.ref and after vcf.ref + int roff; // how many vcf.ref bases already used + if ( rbeg < splice->vcf.pos ) + { + assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD + kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); + roff = 0; + } + else + roff = rbeg - splice->vcf.pos; +#if XDBG +fprintf(pysam_stderr,"r1: %s roff=%d\n",splice->kref.s,roff); +#endif + + if ( roff < splice->vcf.rlen && splice->kref.l < rlen ) + { + int len = splice->vcf.rlen - roff; // len still available in vcf.ref + if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed + kputsn(splice->vcf.ref + roff, len, &splice->kref); + } +#if XDBG +fprintf(pysam_stderr,"r2: %s\n",splice->kref.s); +#endif + + uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele + if ( splice->kref.l < rlen ) + { + if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD) + rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end; + if ( splice->kref.l < rlen ) + kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); + } +#if XDBG +fprintf(pysam_stderr,"r3: %s\n",splice->kref.s); +#endif + + + int aoff; + if ( abeg < splice->vcf.pos ) + { + assert( splice->tr->beg <= abeg ); + kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); + aoff = 0; + } + else + aoff = abeg - splice->vcf.pos; +#if XDBG +fprintf(pysam_stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff); +#endif + + if ( aoff < splice->vcf.alen && splice->kalt.l < alen ) + { + int len = splice->vcf.alen - aoff; // len still available in vcf.alt + if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed + kputsn(splice->vcf.alt + aoff, len, &splice->kalt); + aoff -= len; + } + if ( aoff < 0 ) aoff = 0; + else aoff--; +#if XDBG +fprintf(pysam_stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); +#endif + + end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele + if ( splice->kalt.l < alen ) + { + if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long + alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end; + if ( alen > 0 && alen > splice->kalt.l ) + kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); + } +#if XDBG +fprintf(pysam_stderr,"a3: %s\n",splice->kalt.s); +fprintf(pysam_stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s); +#endif +} +void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); +static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid) +{ + while ( regitr_overlap(itr) ) + { + gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); + tscript_t *tr = utr->tr; + if ( tr->id != trid ) continue; + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); + return csq.type.type; + } + return 0; +} +static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type) +{ +#if XDBG +fprintf(pysam_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); +#endif + if ( !type ) return; + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = type; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); +} +static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp + // before and after the inserted bases + if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] ) + { + splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend; + } + else + { + if ( splice->tend ) splice->tend--; + splice->ref_beg = splice->vcf.pos; + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend; + } +#if XDBG +fprintf(pysam_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +#endif + + int ret; + if ( splice->ref_beg >= ex_end ) // fully outside, beyond the exon + { + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr + { + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + if ( ret!=0 ) + { + regitr_destroy(itr); + return SPLICE_OUTSIDE; // overlaps utr + } + } + regitr_destroy(itr); + } + if ( !splice->check_region_end ) return SPLICE_OUTSIDE; + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_beg < ex_end + N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon + { + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr + { + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + if ( ret!=0 ) + { + regitr_destroy(itr); + return SPLICE_OUTSIDE; // overlaps utr + } + } + regitr_destroy(itr); + } + if ( !splice->check_region_beg ) return SPLICE_OUTSIDE; + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON); + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_end > ex_beg - N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + // overlaps the exon or inside the exon + // possible todo: find better alignment for frameshifting variants? + if ( splice->ref_beg <= ex_beg + 2 ) // in the first 3bp + { + if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->ref_end > ex_end - 2 ) + { + if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->set_refalt ) + { + // Make sure the variant will not end up left aligned to avoid overlapping vcf records + // splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1); + // splice->vcf.rlen -= splice->tbeg + splice->tend - 1; + // if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; } + if ( splice->ref_beg < splice->vcf.pos ) // this must have been caused by too much trimming from right + { + int dlen = splice->vcf.pos - splice->ref_beg; + assert( dlen==1 ); + splice->tbeg += dlen; + if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen; + splice->ref_beg = splice->vcf.pos; + } + if ( splice->ref_end==ex_beg ) splice->tend--; // prevent zero-length ref allele + splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1); + splice->vcf.rlen -= splice->tbeg + splice->tend - 1; + if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; } + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_INSIDE; +} + +static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG + splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base + +#if XDBG +fprintf(pysam_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +#endif + + if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1 + { + if ( splice->check_region_beg ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + // filling from the left does not work for ENST00000341065/frame3.vcf + // CAG.GTGGCCAG CAG.GTGGCCAG + // CA-.--GGCCAG vs CAG.---GCCAG + // splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON); + // + // filling from the right: + splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON); + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + } + } + if ( splice->ref_end >= ex_beg ) + { + splice->tbeg = splice->ref_beg - splice->vcf.pos + 1; + splice->ref_beg = ex_beg - 1; + if ( splice->tbeg + splice->tend == splice->vcf.alen ) + { + // the deletion overlaps ex_beg and cannot be easily realigned to the right + if ( !splice->tend ) + { + splice->csq |= CSQ_CODING_SEQUENCE; + return SPLICE_OVERLAP; + } + splice->tend--; + } + } + } + if ( ex_end < splice->ref_end ) // the part after the exon + { + if ( splice->check_region_end ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + char *ref = NULL, *alt = NULL; + if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available + { + splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); // ref,alt positioned at the first intron base + ref = splice->kref.s, alt = splice->kalt.s; + } + if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR ) + { + splice->csq |= CSQ_SPLICE_REGION; + if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + if ( splice->ref_beg < ex_end + N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + } + } + } + if ( splice->ref_beg < ex_end ) + { + splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); + splice->ref_end = ex_end; + } + } + if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end ) + { + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + + if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 + { + if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->ref_end > ex_end - 3 ) + { + if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->set_refalt ) + { + if ( splice->tbeg>0 ) splice->tbeg--; //why is this? + if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend ) + { + splice->vcf.rlen -= splice->tbeg + splice->tend; + splice->vcf.alen -= splice->tbeg + splice->tend; + } + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); + if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf + { + splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION; + return SPLICE_OVERLAP; + } + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_INSIDE; +} + +static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + // not a real variant, can be ignored: eg ACGT>ACGT + if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF; + + splice->ref_beg = splice->vcf.pos + splice->tbeg; + splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; + +#if XDBG +fprintf(pysam_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +#endif + + if ( splice->ref_beg < ex_beg ) // the part before the exon + { + if ( splice->check_region_beg ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR ) + splice->csq |= CSQ_SPLICE_REGION; + if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + } + } + } + if ( splice->ref_end >= ex_beg ) + { + splice->tbeg = splice->ref_beg - splice->vcf.pos; + splice->ref_beg = ex_beg; + } + } + if ( ex_end < splice->ref_end ) // the part after the exon + { + if ( splice->check_region_end ) + { + int csq = 0; + if ( splice->check_utr ) + { + regitr_t *itr = regitr_init(NULL); + const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); + regitr_destroy(itr); + } + if ( !csq ) + { + if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR ) + splice->csq |= CSQ_SPLICE_REGION; + if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR ) + { + if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR; + if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; + } + } + } + if ( splice->ref_beg <= ex_end ) + { + splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); + splice->ref_end = ex_end; + } + } + if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end ) + { + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_OUTSIDE; + } + + if ( splice->ref_beg < ex_beg + 3 ) + { + if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->ref_end > ex_end - 3 ) + { + if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION; + if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } + else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } + } + if ( splice->set_refalt ) + { + splice->vcf.rlen -= splice->tbeg + splice->tend; + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); + } + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + return SPLICE_INSIDE; +} +static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +{ + splice->csq = 0; + splice->vcf.alen = strlen(splice->vcf.alt); + + int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0; + splice->tbeg = 0, splice->tend = 0; + + // trim from the right, then from the left + while ( i<=rlen1 && i<=alen1 ) + { + if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break; + i++; + } + splice->tend = i; + rlen1 -= i, alen1 -= i, i = 0; + while ( i<=rlen1 && i<=alen1 ) + { + if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break; + i++; + } + splice->tbeg = i; + + // The mnp, ins and del code was split into near-identical functions for clarity and debugging; + // possible todo: generalize once stable + if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end); + if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end); + if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end); + + return 0; +} + +// return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref) +int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial) +{ + int i; + kstring_t str = {0,0,0}; + tscript_t *tr = cds->tr; + child->icds = cds->icds; // index of cds in the tscript's list of exons + + splice_t splice; + splice_init(&splice, rec); + splice.tr = tr; + splice.vcf.alt = rec->d.allele[ial]; + splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1; + if ( !(tr->trim & TRIM_5PRIME) ) + { + if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; } + else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; } + } + if ( !(tr->trim & TRIM_3PRIME) ) + { + if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; } + else { if ( child->icds==0 ) splice.check_stop = 1; } + } + if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M + { + if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } + else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } + } + if ( child->icds!=0 ) splice.check_region_beg = 1; + if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; + +#if XDBG +fprintf(pysam_stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); +#endif + int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1); +#if XDBG +fprintf(pysam_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq); +#endif + + if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA + if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq + { + free(splice.kref.s); + free(splice.kalt.s); + + if ( !splice.csq ) return 2; // fully intronic, no csq + + // splice_region/acceptor/donor + child->seq = NULL; + child->sbeg = 0; + child->rbeg = rec->pos; + child->rlen = 0; + child->dlen = 0; + kputs(rec->d.allele[0],&str); + kputc('>',&str); + kputs(rec->d.allele[ial],&str); + child->var = str.s; + child->type = HAP_SSS; + child->csq = splice.csq; + child->prev = parent->type==HAP_SSS ? parent->prev : parent; + child->rec = rec; + return 0; + } + if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT; // synonymous&splice,frame could become synonymous&frame,splice + + int dbeg = 0; + if ( splice.ref_beg < cds->beg ) + { + // The vcf record overlaps the exon boundary, but the variant itself + // should fit inside since we are here. This will need more work. + // #1475227917 + dbeg = cds->beg - splice.ref_beg; + splice.kref.l -= dbeg; + splice.ref_beg = cds->beg; + assert( dbeg <= splice.kalt.l ); + } + + if ( parent->type==HAP_SSS ) parent = parent->prev; + if ( parent->type==HAP_CDS ) + { + i = parent->icds; + if ( i!=cds->icds ) + { + // the variant is on a new exon, finish up the previous + int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg; + if ( len > 0 ) + kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + } + + // append any skipped non-variant exons + while ( ++i < cds->icds ) + kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); + + if ( parent->icds==child->icds ) + { + int len = splice.ref_beg - parent->rbeg - parent->rlen; + if ( len < 0 ) // overlapping variants + { + free(str.s); + return 1; + } + kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + } + else + kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); + } + kputs(splice.kalt.s + dbeg, &str); + + child->seq = str.s; + child->sbeg = cds->pos + (splice.ref_beg - cds->beg); + child->rbeg = splice.ref_beg; + child->rlen = splice.kref.l; + child->type = HAP_CDS; + child->prev = parent; + child->rec = rec; + child->csq = splice.csq; + + // set vlen and the "ref>alt" string + { + int rlen = strlen(rec->d.allele[0]); + int alen = strlen(rec->d.allele[ial]); + child->dlen = alen - rlen; + child->var = (char*) malloc(rlen+alen+2); + memcpy(child->var,rec->d.allele[0],rlen); + child->var[rlen] = '>'; + memcpy(child->var+rlen+1,rec->d.allele[ial],alen); + child->var[rlen+alen+1] = 0; + } + + // yuck, the whole CDS is modified/deleted, not ready for this, todo. + if ( child->rbeg + child->rlen > cds->beg + cds->len ) + { + child->type = HAP_SSS; + if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf + } + + free(splice.kref.s); + free(splice.kalt.s); + return 0; +} +void hap_destroy(hap_node_t *hap) +{ + int i; + for (i=0; inchild; i++) + if ( hap->child[i] ) hap_destroy(hap->child[i]); + for (i=0; imcsq_list; i++) free(hap->csq_list[i].type.vstr.s); + free(hap->csq_list); + free(hap->child); + free(hap->cur_child); + free(hap->seq); + free(hap->var); + free(hap); +} + + +/* + ref: spliced reference and its length (ref.l) + seq: part of the spliced query transcript on the reference strand to translate, its + length (seq.l) and the total length of the complete transcript (seq.m) + sbeg: seq offset within the spliced query transcript + rbeg: seq offset within ref, 0-based + rend: last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l + strand: coding strand - 0:rev, 1:fwd + tseq: translated sequence (aa) + fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev) + */ +void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) +{ +#if XDBG +fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); +#endif + char tmp[3], *codon, *end; + int i, len, npad; + + kstring_t ref = *_ref; + kstring_t seq = *_seq; + + tseq->l = 0; + if ( !seq.l ) + { + kputc('?', tseq); + return; + } + +#define DBG 0 +#if DBG + fprintf(pysam_stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); + fprintf(pysam_stderr," ref: l=%d %s\n", (int)ref.l,ref.s); + fprintf(pysam_stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m); + for (i=0; i1 + fprintf(pysam_stderr," npad: %d\n",npad); +#endif + assert( npad<=rbeg ); + + for (i=0; i1 + fprintf(pysam_stderr,"\t i=%d\n", i); +#endif + if ( i==3 ) + { + kputc_(dna2aa(tmp), tseq); +#if DBG>1 + fprintf(pysam_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]); +#endif + codon = seq.s + 3 - npad; // next codon + end = codon + len - 1 - (len % 3); // last position of a valid codon + while ( codon < end ) + { + kputc_(dna2aa(codon), tseq); +#if DBG>1 + fprintf(pysam_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]); +#endif + codon += 3; + } + end = seq.s + seq.l - 1; + for (i=0; codon+i<=end; i++) tmp[i] = codon[i]; + } + + // right padding + codon = ref.s + rend + N_REF_PAD; + if ( i>0 ) + { +#if DBG>1 + if(i==1)fprintf(pysam_stderr,"[3]%c\n",tmp[0]); + if(i==2)fprintf(pysam_stderr,"[3]%c%c\n",tmp[0],tmp[1]); +#endif + for (; i<3; i++) + { + tmp[i] = *codon; + codon++; + } + kputc_(dna2aa(tmp), tseq); +#if DBG>1 + fprintf(pysam_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]); +#endif + } + if ( fill!=0 ) + { + end = ref.s + ref.l - N_REF_PAD; + while ( codon+3 <= end ) + { + kputc_(dna2aa(codon), tseq); +#if DBG>1 + fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon)); +#endif + codon += 3; + } + } + } + else // STRAND_REV + { + // right padding - number of bases to take from ref + npad = (seq.m - (sbeg + seq.l)) % 3; +#if DBG>1 + fprintf(pysam_stderr," npad: %d\n",npad); +#endif +if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); + assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand + + if ( npad==2 ) + { + tmp[1] = ref.s[rend+N_REF_PAD]; + tmp[2] = ref.s[rend+N_REF_PAD+1]; + i = 0; + } + else if ( npad==1 ) + { + tmp[2] = ref.s[rend+N_REF_PAD]; + i = 1; + } + else + i = 2; + + end = seq.s + seq.l; + for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end); +#if DBG>1 + fprintf(pysam_stderr,"\t i=%d\n", i); + if(i==1)fprintf(pysam_stderr,"[0] %c\n",tmp[2]); + if(i==0)fprintf(pysam_stderr,"[0] %c%c\n",tmp[1],tmp[2]); +#endif + if ( i==-1 ) + { +#if DBG>1 + fprintf(pysam_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp)); +#endif + kputc_(cdna2aa(tmp), tseq); + codon = end - 3; + while ( codon >= seq.s ) + { + kputc_(cdna2aa(codon), tseq); +#if DBG>1 + fprintf(pysam_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon)); +#endif + codon -= 3; + } + if ( seq.s-codon==2 ) + { + tmp[2] = seq.s[0]; + i = 1; + } + else if ( seq.s-codon==1 ) + { + tmp[1] = seq.s[0]; + tmp[2] = seq.s[1]; + i = 0; + } + else + i = -1; +#if DBG>1 + if(i==1)fprintf(pysam_stderr,"[3] %c\n",tmp[2]); + if(i==0)fprintf(pysam_stderr,"[3] %c%c\n",tmp[1],tmp[2]); +#endif + } + // left padding + end = ref.s + N_REF_PAD + rbeg; + if ( i>=0 ) + { + for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end); + kputc_(cdna2aa(tmp), tseq); +#if DBG>1 + fprintf(pysam_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp)); +#endif + } + if ( fill!=0 ) + { + codon = end - 3; + while ( codon >= ref.s + N_REF_PAD ) + { + kputc_(cdna2aa(codon), tseq); +#if DBG>1 + fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon)); +#endif + codon -= 3; + } + } + } + kputc_(0,tseq); tseq->l--; +#if DBG + fprintf(pysam_stderr," tseq: %s\n", tseq->s); +#endif +} + +void tscript_splice_ref(tscript_t *tr) +{ + int i, len = 0; + for (i=0; incds; i++) + len += tr->cds[i]->len; + + tr->nsref = len + 2*N_REF_PAD; + tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); + len = 0; + + memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); + len += N_REF_PAD; + + for (i=0; incds; i++) + { + memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); + len += tr->cds[i]->len; + } + memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); + len += N_REF_PAD; + + tr->sref[len] = 0; +} + +// returns: 0 if consequence was added, 1 if it already exists or could not be added +int csq_push(args_t *args, csq_t *csq, bcf1_t *rec) +{ +#if XDBG +fprintf(pysam_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); +#endif + khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos); + vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k); + if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr); + + int i; + for (i=0; in; i++) + if ( vbuf->vrec[i]->line==rec ) break; + if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr); + vrec_t *vrec = vbuf->vrec[i]; + + // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor + if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) ) + csq->type.type &= ~CSQ_SPLICE_REGION; + + if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) + { + for (i=0; invcsq; i++) + { + // Same as below, to avoid records like + // 3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|- + // 3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C + if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP ) + { + vrec->vcsq[i] = csq->type; + goto exit_duplicate; + } + if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue; + if ( csq->type.ref != vrec->vcsq[i].ref ) continue; + goto exit_duplicate; + } + } + else if ( csq->type.type & CSQ_COMPOUND ) + { + for (i=0; invcsq; i++) + { + if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue; + if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; + if ( csq->type.gene != vrec->vcsq[i].gene ) continue; + if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) + { + // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function + // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered + // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two + // consequences: + // stop_lost|AL627309.1|ENST00000423372|protein_coding|- + // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA + if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s ) + { + if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP ) + { + vrec->vcsq[i].type |= csq->type.type; + + // remove stop_lost&synonymous if stop_retained set + if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) + vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT); + + if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr; + goto exit_duplicate; + } + continue; + } + if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue; + } + vrec->vcsq[i].type |= csq->type.type; + goto exit_duplicate; + } + } + else + { + for (i=0; invcsq; i++) + { + if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue; + if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; + if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) ) + { + vrec->vcsq[i].type |= csq->type.type; + goto exit_duplicate; + } + if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate; + } + } + // no such csq yet in this vcf record + csq->vrec = vrec; + csq->idx = i; + vrec->nvcsq++; + hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq); + vrec->vcsq[i] = csq->type; + return 0; + +exit_duplicate: + csq->vrec = vrec; + csq->idx = i; + return 1; +} + +// soff .. position of the variant within the trimmed query transcript +// sbeg .. position of the variant within the query transcript +// rbeg .. position on the reference transcript (if there are no indels, then rbeg=send) +// rpos .. VCF position +#define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen)) +#define node2sbeg(i) (hap->sbeg + node2soff(i)) +#define node2send(i) (hap->sbeg + hap->stack[i].slen) +#define node2rbeg(i) (hap->stack[i].node->sbeg) +#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen) +#define node2rpos(i) (hap->stack[i].node->rec->pos) + +void kput_vcsq(vcsq_t *csq, kstring_t *str) +{ + // Remove start/stop from incomplete CDS, but only if there is another + // consequence as something must be reported + if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS); + + // Remove missense from start/stops + if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT; + + if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref ) + { + kputc_('@',str); + kputw(csq->ref->pos+1, str); + return; + } + if ( csq->type & CSQ_UPSTREAM_STOP ) + kputc_('*',str); + + int i, n = sizeof(csq_strings)/sizeof(char*); + for (i=1; itype&(1<type&(1<gene ) kputs(csq->gene , str); + + kputc_('|', str); + if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid)); + + kputc_('|', str); + kputs(gf_type2gff_string(csq->biotype), str); + + if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l ) + kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str); + + if ( csq->vstr.l ) + kputs(csq->vstr.s, str); +} + +void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) +{ + int i; + tscript_t *tr = hap->tr; + int ref_node = tr->strand==STRAND_FWD ? ibeg : iend; + + int icsq = node->ncsq_list++; + hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); + csq_t *csq = &node->csq_list[icsq]; + csq->pos = hap->stack[ref_node].node->rec->pos; + csq->type.trid = tr->id; + csq->type.gene = tr->gene->name; + csq->type.strand = tr->strand; + csq->type.biotype = tr->type; + + // only now we see the translated sequence and can determine if the stop/start changes are real + int rm_csq = 0; + csq->type.type = 0; + for (i=ibeg; i<=iend; i++) + csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND; + if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING; + + int has_upstream_stop = hap->upstream_stop; + if ( hap->stack[ibeg].node->type != HAP_SSS ) + { + // check for truncating stops + for (i=0; itref.l; i++) + if ( hap->tref.s[i]=='*' ) break; + if ( i!=hap->tref.l ) + { + hap->tref.l = i+1; + hap->tref.s[i+1] = 0; + } + for (i=0; itseq.l; i++) + if ( hap->tseq.s[i]=='*' ) break; + if ( i!=hap->tseq.l ) + { + hap->tseq.l = i+1; + hap->tseq.s[i+1] = 0; + hap->upstream_stop = 1; + } + if ( csq->type.type & CSQ_STOP_LOST ) + { + if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) + { + rm_csq |= CSQ_STOP_LOST; + csq->type.type |= CSQ_STOP_RETAINED; + } + else if ( hap->tref.s[hap->tref.l-1]!='*' ) + { + // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense + // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf + if ( hap->tseq.s[hap->tseq.l-1] == '*' ) + { + rm_csq |= CSQ_STOP_GAINED; + csq->type.type |= CSQ_STOP_RETAINED; + } + else + csq->type.type |= CSQ_INCOMPLETE_CDS; + } + } + if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' ) + { + rm_csq |= CSQ_START_LOST; + csq->type.type &= ~CSQ_START_LOST; + } + if ( dlen!=0 ) + { + if ( dlen%3 ) + csq->type.type |= CSQ_FRAMESHIFT_VARIANT; + else if ( dlen<0 ) + csq->type.type |= CSQ_INFRAME_DELETION; + else + csq->type.type |= CSQ_INFRAME_INSERTION; + } + else + { + for (i=0; itref.l; i++) + if ( hap->tref.s[i] != hap->tseq.s[i] ) break; + if ( i==hap->tref.l ) + csq->type.type |= CSQ_SYNONYMOUS_VARIANT; + else if ( hap->tref.s[i] == '*' ) + csq->type.type |= CSQ_STOP_LOST; + else if ( hap->tseq.s[i] == '*' ) + csq->type.type |= CSQ_STOP_GAINED; + else + csq->type.type |= CSQ_MISSENSE_VARIANT; + } + } + if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP; + csq->type.type &= ~rm_csq; + + if ( hap->stack[ibeg].node->type == HAP_SSS ) + { + node->csq_list[icsq].type.type |= hap->stack[ibeg].node->csq & ~rm_csq; + node->csq_list[icsq].type.ref = hap->stack[ibeg].node->rec; + node->csq_list[icsq].type.biotype = tr->type; + csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec); + return; + } + + kstring_t str = node->csq_list[icsq].type.vstr; + str.l = 0; + + // create the aa variant string + int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; + int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; + kputc_('|', &str); + kputw(aa_rbeg, &str); + kputs(hap->tref.s, &str); + if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) + { + kputc_('>', &str); + kputw(aa_sbeg, &str); + kputs(hap->tseq.s, &str); + } + kputc_('|', &str); + + // create the dna variant string and, in case of combined variants, + // insert silent CSQ_PRINTED_UPSTREAM variants + for (i=ibeg; i<=iend; i++) + { + if ( i>ibeg ) kputc_('+', &str); + kputw(node2rpos(i)+1, &str); + kputs(hap->stack[i].node->var, &str); + } + node->csq_list[icsq].type.vstr = str; + csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec); + + for (i=ibeg; i<=iend; i++) + { + // csq are printed at one position only for combined variants, the rest is + // silent and references the first + if ( hap->stack[i].node->csq & ~CSQ_COMPOUND ) + { + node->ncsq_list++; + hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); + csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; + tmp_csq->pos = hap->stack[i].node->rec->pos; + tmp_csq->type.trid = tr->id; + tmp_csq->type.gene = tr->gene->name; + tmp_csq->type.strand = tr->strand; + tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq; + tmp_csq->type.biotype = tr->type; + tmp_csq->type.vstr.l = 0; + kputs(str.s,&tmp_csq->type.vstr); + csq_push(args, tmp_csq, hap->stack[i].node->rec); + } + if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) ) + { + node->ncsq_list++; + hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); + csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; + tmp_csq->pos = hap->stack[i].node->rec->pos; + tmp_csq->type.trid = tr->id; + tmp_csq->type.gene = tr->gene->name; + tmp_csq->type.strand = tr->strand; + tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq; + tmp_csq->type.biotype = tr->type; + tmp_csq->type.ref = hap->stack[ref_node].node->rec; + tmp_csq->type.vstr.l = 0; + csq_push(args, tmp_csq, hap->stack[i].node->rec); + } + } +} + +void hap_finalize(args_t *args, hap_t *hap) +{ + tscript_t *tr = hap->tr; + if ( !tr->sref ) + tscript_splice_ref(tr); + + kstring_t sref; + sref.s = tr->sref; + sref.l = tr->nsref; + sref.m = sref.l; + + int istack = 0; + hts_expand(hstack_t,1,hap->mstack,hap->stack); + + hap->sseq.l = 0; + hap->tseq.l = 0; + hap->stack[0].node = tr->root; + hap->stack[0].ichild = -1; + hap->stack[0].slen = 0; + hap->stack[0].dlen = 0; + + while ( istack>=0 ) + { + hstack_t *stack = &hap->stack[istack]; + hap_node_t *node = hap->stack[istack].node; + while ( ++hap->stack[istack].ichild < node->nchild ) + { + if ( node->child[stack->ichild] ) break; + } + if ( stack->ichild == node->nchild ) { istack--; continue; } + + node = node->child[stack->ichild]; + + istack++; + hts_expand(hstack_t,istack+1,hap->mstack,hap->stack); + stack = &hap->stack[istack-1]; + + hap->stack[istack].node = node; + hap->stack[istack].ichild = -1; + + hap->sseq.l = stack->slen; + if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq); + hap->stack[istack].slen = hap->sseq.l; + hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen; + + if ( !node->nend ) continue; // not a leaf node + + // The spliced sequence has been built for the current haplotype and stored + // in hap->sseq. Now we break it and output as independent parts + + kstring_t sseq; + sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript + hap->upstream_stop = 0; + + int i = 1, dlen = 0, ibeg, indel = 0; + while ( istack[i].node->type == HAP_SSS ) i++; + hap->sbeg = hap->stack[i].node->sbeg; + + if ( tr->strand==STRAND_FWD ) + { + i = 0, ibeg = -1; + while ( ++i <= istack ) + { + if ( hap->stack[i].node->type == HAP_SSS ) + { + // start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping + hap_add_csq(args,hap,node,0,i,i,0,0); + continue; + } + dlen += hap->stack[i].node->dlen; + if ( hap->stack[i].node->dlen ) indel = 1; + if ( isseq.l ) + { + sseq.l = hap->stack[i].slen - ioff; + sseq.s = hap->sseq.s + ioff; + } + else // splice site overlap, see #1475227917 + sseq.l = fill = 0; + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + + // ref + sseq.l = node2rend(i) - rbeg; + sseq.s = sref.s + N_REF_PAD + rbeg; + sseq.m = sref.m - 2*N_REF_PAD; + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; + + hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel); + ibeg = -1; + dlen = 0; + indel = 0; + } + } + else + { + i = istack + 1, ibeg = -1; + while ( --i > 0 ) + { + if ( hap->stack[i].node->type == HAP_SSS ) + { + hap_add_csq(args,hap,node,0,i,i,0,0); + continue; + } + dlen += hap->stack[i].node->dlen; + if ( hap->stack[i].node->dlen ) indel = 1; + if ( i>1 && hap->stack[i-1].node->type != HAP_SSS ) + { + if ( dlen%3 ) + { + if ( ibeg==-1 ) ibeg = i; + continue; + } + int icur = sseq.m - 1 - node2sbeg(i); + int inext = sseq.m - 1 - node2sbeg(i-1); + if ( icur/3 == inext/3 ) + { + if ( ibeg==-1 ) ibeg = i; + continue; + } + } + if ( ibeg<0 ) ibeg = i; + int ioff = node2soff(i); + int icur = node2sbeg(i); + int rbeg = node2rbeg(i); + int rend = node2rend(ibeg); + int fill = dlen%3; + + // alt + if ( hap->sseq.l ) + { + sseq.l = hap->stack[ibeg].slen - ioff; + sseq.s = hap->sseq.s + ioff; + } + else // splice site overlap, see #1475227917 + sseq.l = fill = 0; + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + + // ref + sseq.l = node2rend(ibeg) - rbeg; + sseq.s = sref.s + N_REF_PAD + rbeg; + sseq.m = sref.m - 2*N_REF_PAD; + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; + + hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel); + ibeg = -1; + dlen = 0; + indel = 0; + } + } + } +} + +static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap) +{ + if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return; + + char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-"; + const char *chr = bcf_hdr_id2name(args->hdr,args->rid); + + fprintf(args->out,"CSQ\t%s\t", smpl); + if ( ihap>0 ) + fprintf(args->out,"%d", ihap); + else + fprintf(args->out,"-"); + + args->str.l = 0; + kput_vcsq(&csq->type, &args->str); + fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); +} +static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +{ + if ( !node || !node->ncsq_list ) return; + + char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-"; + const char *chr = bcf_hdr_id2name(args->hdr,args->rid); + + int i; + for (i=0; incsq_list; i++) + { + csq_t *csq = node->csq_list + i; + if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue; + assert( csq->type.vstr.l ); + + fprintf(args->out,"CSQ\t%s\t", smpl); + if ( ihap>0 ) + fprintf(args->out,"%d", ihap); + else + fprintf(args->out,"-"); + + args->str.l = 0; + kput_vcsq(&csq->type, &args->str); + fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); + } +} + +static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +{ + if ( !node || !node->ncsq_list || ismpl<0 ) return; + + int i; + for (i=0; incsq_list; i++) + { + csq_t *csq = node->csq_list + i; + vrec_t *vrec = csq->vrec; + int icsq = 2*csq->idx + ihap; + if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + { + int print_warning = 1; + if ( args->quiet ) + { + if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; + args->ncsq_small_warned = 1; + } + if ( print_warning ) + { + fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", + args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); + if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n"); + } + break; + } + if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; + vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + } +} + +void hap_flush(args_t *args, uint32_t pos) +{ + int i,j; + tr_heap_t *heap = args->active_tr; + + while ( heap->ndat && heap->dat[0]->end<=pos ) + { + tscript_t *tr = heap->dat[0]; + khp_delete(trhp, heap); + + args->hap->tr = tr; + if ( tr->root && tr->root->nchild ) // normal, non-localized calling + { + hap_finalize(args, args->hap); + + if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf + { + if ( args->phase==PHASE_DROP_GT ) + hap_print_text(args, tr, -1,0, tr->hap[0]); + else + { + for (i=0; ismpl->n; i++) + { + for (j=0; j<2; j++) + hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]); + } + } + } + else if ( args->phase!=PHASE_DROP_GT ) + { + for (i=0; ismpl->n; i++) + { + for (j=0; j<2; j++) + hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]); + } + } + } + + // mark the transcript for deletion. Cannot delete it immediately because + // by-position VCF output will need them when flushed by vcf_buf_push + args->nrm_tr++; + hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); + args->rm_tr[args->nrm_tr-1] = tr; + } +} + +#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } + +void vbuf_push(args_t *args, bcf1_t **rec_ptr) +{ + int i; + + assert(rec_ptr); + bcf1_t *rec = *rec_ptr; + + // check for duplicate records + i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1; + if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos ) + { + // vcf record with a new pos + rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf); + i = rbuf_append(&args->vcf_rbuf); + if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t)); + args->vcf_buf[i]->n = 0; + } + vbuf_t *vbuf = args->vcf_buf[i]; + vbuf->n++; + hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec); + if ( !vbuf->vrec[vbuf->n - 1] ) + vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t)); + + vrec_t *vrec = vbuf->vrec[vbuf->n - 1]; + if ( args->phase!=PHASE_DROP_GT && args->smpl->n ) + { + if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq); + else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq); + } + if ( !vrec->line ) vrec->line = bcf_init1(); + SWAP(bcf1_t*, (*rec_ptr), vrec->line); + + int ret; + khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret); + kh_val(args->pos2vbuf,k) = vbuf; +} + +void vbuf_flush(args_t *args) +{ + if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone + + int i,j; + while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 ) + { + vbuf_t *vbuf = args->vcf_buf[i]; + for (i=0; in; i++) + { + vrec_t *vrec = vbuf->vrec[i]; + if ( !args->out_fh ) // not a VCF output + { + vrec->nvcsq = 0; + continue; + } + if ( !vrec->nvcsq ) + { + bcf_write(args->out_fh, args->hdr, vrec->line); + continue; + } + + args->str.l = 0; + kput_vcsq(&vrec->vcsq[0], &args->str); + for (j=1; jnvcsq; j++) + { + kputc_(',', &args->str); + kput_vcsq(&vrec->vcsq[j], &args->str); + } + bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s); + if ( args->hdr_nsmpl ) + { + if ( vrec->nfmt < args->nfmt_bcsq ) + for (j=1; jhdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl)); + bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); + } + vrec->nvcsq = 0; + bcf_write(args->out_fh, args->hdr, vrec->line); + } + if ( vbuf->n ) + { + khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos); + if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k); + } + vbuf->n = 0; + } + + for (i=0; inrm_tr; i++) + { + tscript_t *tr = args->rm_tr[i]; + if ( tr->root ) hap_destroy(tr->root); + tr->root = NULL; + free(tr->hap); + free(tr->ref); + free(tr->sref); + } + args->nrm_tr = 0; + args->ncsq_buf = 0; +} + +void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) +{ + int i, len; + int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg; + + tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); + if ( !tr->ref ) + error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1); + + int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); + if ( pad_beg + pad_end != 2*N_REF_PAD ) + { + char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD); + for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; + memcpy(ref+i, tr->ref, len); + for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; + free(tr->ref); + tr->ref = ref; + } +} + +static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) +{ + char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0); + char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos); + assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) ); + while ( *ref && *vcf ) + { + if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) ) + error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]); + ref++; + vcf++; + } +} + +int test_cds_local(args_t *args, bcf1_t *rec) +{ + int i,j, ret = 0; + const char *chr = bcf_seqname(args->hdr,rec); + // note that the off-by-one extension of rlen is deliberate to account for insertions + if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + + // structures to fake the normal test_cds machinery + hap_node_t root, node; + root.type = HAP_ROOT; + kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq; + + while ( regitr_overlap(args->itr) ) + { + gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); + tscript_t *tr = cds->tr; + if ( !GF_is_coding(tr->type) ) continue; + ret = 1; + + if ( !tr->ref ) + { + tscript_init_ref(args, tr, chr); + tscript_splice_ref(tr); + khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards + } + + sanity_check_ref(args, tr, rec); + + kstring_t sref; + sref.s = tr->sref; + sref.l = tr->nsref; + sref.m = sref.l; + + for (i=1; in_allele; i++) + { + if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; + + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + + int csq_type = node.csq; + + // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though + if ( node.type == HAP_SSS ) + { + csq.type.type = csq_type; + csq_stage(args, &csq, rec); + } + else + { + kstring_t sseq; + sseq.m = sref.m - 2*N_REF_PAD + node.dlen; + sseq.s = node.seq; + int alen = sseq.l = strlen(sseq.s); + int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917 + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill); + + sseq.m = sref.m - 2*N_REF_PAD; + sseq.s = sref.s + N_REF_PAD + node.sbeg; + sseq.l = node.rlen; + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill); + + // check for truncating stops + for (j=0; jl; j++) + if ( tref->s[j]=='*' ) break; + if ( j!=tref->l ) + { + tref->l = j+1; + tref->s[j+1] = 0; + } + for (j=0; jl; j++) + if ( tseq->s[j]=='*' ) break; + if ( j!=tseq->l ) + { + tseq->l = j+1; + tseq->s[j+1] = 0; + } + if ( csq_type & CSQ_STOP_LOST ) + { + if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) + { + csq_type &= ~CSQ_STOP_LOST; + csq_type |= CSQ_STOP_RETAINED; + } + else if (tref->s[tref->l-1]!='*' ) + { + // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense + // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf + if ( tseq->s[tseq->l-1] == '*' ) + { + csq_type &= ~CSQ_STOP_GAINED; + csq_type |= CSQ_STOP_RETAINED; + } + else + csq_type |= CSQ_INCOMPLETE_CDS; + } + } + if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' ) + csq_type &= ~CSQ_START_LOST; + if ( node.dlen!=0 ) + { + if ( node.dlen%3 ) + csq_type |= CSQ_FRAMESHIFT_VARIANT; + else if ( node.dlen<0 ) + csq_type |= CSQ_INFRAME_DELETION; + else + csq_type |= CSQ_INFRAME_INSERTION; + } + else + { + for (j=0; jl; j++) + if ( tref->s[j] != tseq->s[j] ) break; + if ( j==tref->l ) + csq_type |= CSQ_SYNONYMOUS_VARIANT; + else if ( tref->s[j] == '*' ) + csq_type |= CSQ_STOP_LOST; + else if ( tseq->s[j] == '*' ) + csq_type |= CSQ_STOP_GAINED; + else + csq_type |= CSQ_MISSENSE_VARIANT; + } + if ( csq_type & CSQ_COMPOUND ) + { + // create the aa variant string + kstring_t str = {0,0,0}; + int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; + int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; + kputc_('|', &str); + kputw(aa_rbeg, &str); + kputs(tref->s, &str); + if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) + { + kputc_('>', &str); + kputw(aa_sbeg, &str); + kputs(tseq->s, &str); + } + kputc_('|', &str); + kputw(rec->pos+1, &str); + kputs(node.var, &str); + csq.type.vstr = str; + csq.type.type = csq_type & CSQ_COMPOUND; + csq_stage(args, &csq, rec); + + // all this only to clean vstr when vrec is flushed + if ( !tr->root ) + tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + tr->root->ncsq_list++; + hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list); + csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1; + rm_csq->type.vstr = str; + } + if ( csq_type & ~CSQ_COMPOUND ) + { + csq.type.type = csq_type & ~CSQ_COMPOUND; + csq.type.vstr.l = 0; + csq_stage(args, &csq, rec); + } + } + free(node.seq); + free(node.var); + } + } + return ret; +} + +int test_cds(args_t *args, bcf1_t *rec) +{ + int i, ret = 0, hap_ret; + const char *chr = bcf_seqname(args->hdr,rec); + // note that the off-by-one extension of rlen is deliberate to account for insertions + if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + while ( regitr_overlap(args->itr) ) + { + gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); + tscript_t *tr = cds->tr; + if ( !GF_is_coding(tr->type) ) continue; + ret = 1; + if ( !tr->root ) + { + // initialize the transcript and its haplotype tree, fetch the reference sequence + tscript_init_ref(args, tr, chr); + + tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid + tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*)); + for (i=0; inhap; i++) tr->hap[i] = NULL; + tr->root->nend = tr->nhap; + tr->root->type = HAP_ROOT; + + khp_insert(trhp, args->active_tr, &tr); + } + + sanity_check_ref(args, tr, rec); + + if ( args->phase==PHASE_DROP_GT ) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root; + hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t)); + if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 ) + { + // overlapping or intron variant, cannot apply + if ( hap_ret==1 ) + { + if ( !args->quiet ) + fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + if ( args->out ) + fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + } + else ret = 1; // prevent reporting as intron in test_tscript + free(child); + continue; + } + parent->nend--; + parent->nchild = 1; + parent->mchild = 1; + parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*)); + parent->child[0] = child; + tr->hap[0] = child; + tr->hap[0]->nend = 1; + continue; + } + + // apply the VCF variants and extend the haplotype tree + int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); + ngts /= bcf_hdr_nsamples(args->hdr); + if ( ngts!=1 && ngts!=2 ) + { + if ( !args->quiet ) + fprintf(pysam_stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + if ( args->out ) + fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + continue; + } + for (ismpl=0; ismplsmpl->n; ismpl++) + { + int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts; + if ( gt[0]==bcf_gt_missing ) continue; + + if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end ) + { + if ( args->phase==PHASE_MERGE ) + { + if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1]; + } + if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) + { + if ( args->phase==PHASE_REQUIRE ) + error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); + if ( args->phase==PHASE_SKIP ) + continue; + if ( args->phase==PHASE_NON_REF ) + { + if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1]; + else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0]; + } + } + } + + for (ihap=0; ihapn_allele ); + if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; } + + hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root; + if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 ) + { + // this haplotype has been seen in another sample + tr->hap[i] = parent->child[ parent->cur_child[ial] ]; + tr->hap[i]->nend++; + parent->nend--; + continue; + } + + hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t)); + if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 ) + { + // overlapping or intron variant, cannot apply + if ( hap_ret==1 ) + { + if ( !args->quiet ) + fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", + chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + if ( args->out ) + fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", + chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + } + free(child); + continue; + } + + if ( parent->cur_rec!=rec ) + { + hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child); + for (j=0; jn_allele; j++) parent->cur_child[j] = -1; + parent->cur_rec = rec; + } + + j = parent->nchild++; + hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child); + parent->cur_child[ial] = j; + parent->child[j] = child; + tr->hap[i] = child; + tr->hap[i]->nend++; + parent->nend--; + } + } + } + return ret; +} + +void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) +{ + // known issues: tab output leads to unsorted output. This is because + // coding haplotypes are printed in one go and buffering is not used + // with tab output. VCF output is OK though. + if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists + + int i,j,ngt = 0; + if ( args->phase!=PHASE_DROP_GT ) + { + ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); + if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr); + } + if ( ngt<=0 ) + { + if ( args->output_type==FT_TAB_TEXT ) + csq_print_text(args, csq, -1,0); + return; + } + assert( ngt<=2 ); + + if ( args->output_type==FT_TAB_TEXT ) + { + for (i=0; ismpl->n; i++) + { + int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; + for (j=0; jsmpl->idx[i],j+1); + } + } + return; + } + + vrec_t *vrec = csq->vrec; + for (i=0; ismpl->n; i++) + { + int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; + for (j=0; jidx + j; + if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + { + int ismpl = args->smpl->idx[i]; + int print_warning = 1; + if ( args->quiet ) + { + if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; + args->ncsq_small_warned = 1; + } + if ( print_warning ) + { + fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", + args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); + if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n"); + } + break; + } + if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; + vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + } + } +} +int test_utr(args_t *args, bcf1_t *rec) +{ + const char *chr = bcf_seqname(args->hdr,rec); + // note that the off-by-one extension of rlen is deliberate to account for insertions + if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + + splice_t splice; + splice_init(&splice, rec); + + int i, ret = 0; + while ( regitr_overlap(args->itr) ) + { + gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); + tscript_t *tr = splice.tr = utr->tr; + for (i=1; in_allele; i++) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + splice.vcf.alt = rec->d.allele[i]; + int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); + if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); + ret = 1; + } + } + assert(!splice.kref.s); + assert(!splice.kalt.s); + return ret; +} +int test_splice(args_t *args, bcf1_t *rec) +{ + const char *chr = bcf_seqname(args->hdr,rec); + if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; + + splice_t splice; + splice_init(&splice, rec); + splice.check_acceptor = splice.check_donor = 1; + + int i, ret = 0; + while ( regitr_overlap(args->itr) ) + { + gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*); + splice.tr = exon->tr; + if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites + + splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1; + splice.check_region_end = splice.tr->end==exon->end ? 0 : 1; + + for (i=1; in_allele; i++) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + splice.vcf.alt = rec->d.allele[i]; + splice_csq(args, &splice, exon->beg, exon->end); + if ( splice.csq ) ret = 1; + } + } + free(splice.kref.s); + free(splice.kalt.s); + return ret; +} +int test_tscript(args_t *args, bcf1_t *rec) +{ + const char *chr = bcf_seqname(args->hdr,rec); + if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + + splice_t splice; + splice_init(&splice, rec); + + int i, ret = 0; + while ( regitr_overlap(args->itr) ) + { + tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + for (i=1; in_allele; i++) + { + if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } + splice.vcf.alt = rec->d.allele[i]; + int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); + if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF + csq_t csq; + memset(&csq, 0, sizeof(csq_t)); + csq.pos = rec->pos; + csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING; + csq.type.biotype = tr->type; + csq.type.strand = tr->strand; + csq.type.trid = tr->id; + csq.type.gene = tr->gene->name; + csq_stage(args, &csq, rec); + ret = 1; + } + } + assert(!splice.kref.s); + assert(!splice.kalt.s); + return ret; +} + +void process(args_t *args, bcf1_t **rec_ptr) +{ + if ( !rec_ptr ) + { + hap_flush(args, REGIDX_MAX); + vbuf_flush(args); + return; + } + + bcf1_t *rec = *rec_ptr; + + int call_csq = 1; + if ( !rec->n_allele ) call_csq = 0; // no alternate allele + else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele + else if ( args->filter ) + { + call_csq = filter_test(args->filter, rec, NULL); + if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1; + } + if ( !call_csq ) + { + if ( !args->out_fh ) return; // not a VCF output + vbuf_push(args, rec_ptr); + vbuf_flush(args); + return; + } + + if ( args->rid != rec->rid ) + { + hap_flush(args, REGIDX_MAX); + vbuf_flush(args); + } + args->rid = rec->rid; + vbuf_push(args, rec_ptr); + + int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec); + hit += test_utr(args, rec); + hit += test_splice(args, rec); + if ( !hit ) test_tscript(args, rec); + + hap_flush(args, rec->pos-1); + vbuf_flush(args); + + return; +} + +const char *usage(void) +{ + return + "\n" + "About: Haplotype-aware consequence caller.\n" + "Usage: bcftools csq [options] in.vcf\n" + "\n" + "Required options:\n" + " -f, --fasta-ref reference file in fasta format\n" + " -g, --gff-annot gff3 annotation file\n" + "\n" + "CSQ options:\n" + " -c, --custom-tag use this tag instead of the default BCSQ\n" + " -l, --local-csq localized predictions, consider only one VCF record at a time\n" + " -n, --ncsq maximum number of consequences to consider per site [16]\n" + " -p, --phase how to construct haplotypes and how to deal with unphased data: [r]\n" + " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" + " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" + " r: require phased GTs, throw an error on unphased het GTs\n" + " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" + " s: skip unphased GTs\n" + "Options:\n" + " -e, --exclude exclude sites for which the expression is true\n" + " -i, --include select sites for which the expression is true\n" + " -o, --output write output to a file [standard output]\n" + " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" + " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" + " -q, --quiet suppress warning messages. Can be given two times for even less messages\n" + " -r, --regions restrict to comma-separated list of regions\n" + " -R, --regions-file restrict to regions listed in a file\n" + " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" + " -S, --samples-file samples to include\n" + " -t, --targets similar to -r but streams rather than index-jumps\n" + " -T, --targets-file similar to -R but streams rather than index-jumps\n" + "\n" + "Example:\n" + " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" + "\n" + " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n" + " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n" + " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n" + "\n"; +} + +int main_csq(int argc, char *argv[]) +{ + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + args->output_type = FT_VCF; + args->bcsq_tag = "BCSQ"; + args->ncsq_max = 2*16; + + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"ncsq",1,0,'n'}, + {"custom-tag",1,0,'c'}, + {"local-csq",0,0,'l'}, + {"gff-annot",1,0,'g'}, + {"fasta-ref",1,0,'f'}, + {"include",1,0,'i'}, + {"exclude",1,0,'e'}, + {"output",1,0,'o'}, + {"output-type",1,NULL,'O'}, + {"phase",1,0,'p'}, + {"quiet",0,0,'q'}, + {"regions",1,0,'r'}, + {"regions-file",1,0,'R'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, + {"targets",1,0,'t'}, + {"targets-file",1,0,'T'}, + {0,0,0,0} + }; + int c, targets_is_file = 0, regions_is_file = 0; + char *targets_list = NULL, *regions_list = NULL; + while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0) + { + switch (c) + { + case 'l': args->local_csq = 1; break; + case 'c': args->bcsq_tag = optarg; break; + case 'q': args->quiet++; break; + case 'p': + switch (optarg[0]) + { + case 'a': args->phase = PHASE_AS_IS; break; + case 'm': args->phase = PHASE_MERGE; break; + case 'r': args->phase = PHASE_REQUIRE; break; + case 'R': args->phase = PHASE_NON_REF; break; + case 's': args->phase = PHASE_SKIP; break; + default: error("The -p code \"%s\" not recognised\n", optarg); + } + break; + case 'f': args->fa_fname = optarg; break; + case 'g': args->gff_fname = optarg; break; + case 'n': + args->ncsq_max = 2 * atoi(optarg); + if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg); + break; + case 'o': args->output_fname = optarg; break; + case 'O': + switch (optarg[0]) { + case 't': args->output_type = FT_TAB_TEXT; break; + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: error("The output type \"%s\" not recognised\n", optarg); + } + break; + case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'r': regions_list = optarg; break; + case 'R': regions_list = optarg; regions_is_file = 1; break; + case 's': args->sample_list = optarg; break; + case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; + case 't': targets_list = optarg; break; + case 'T': targets_list = optarg; targets_is_file = 1; break; + case 'h': + case '?': error("%s",usage()); + default: error("The option not recognised: %s\n\n", optarg); break; + } + } + char *fname = NULL; + if ( optind==argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + else error("%s", usage()); + } + else fname = argv[optind]; + if ( argc - optind>1 ) error("%s", usage()); + if ( !args->fa_fname ) error("Missing the --fa-ref option\n"); + if ( !args->gff_fname ) error("Missing the --gff option\n"); + args->sr = bcf_sr_init(); + if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 ) + error("Failed to read the targets: %s\n", targets_list); + if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", regions_list); + if ( !bcf_sr_add_reader(args->sr, fname) ) + error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum)); + args->hdr = bcf_sr_get_header(args->sr,0); + + init_data(args); + while ( bcf_sr_next_line(args->sr) ) + { + process(args, &args->sr->readers[0].buffer[0]); + } + process(args,NULL); + + destroy_data(args); + bcf_sr_destroy(args->sr); + free(args); + + return 0; +} + diff --git a/bcftools/filter.c b/bcftools/filter.c index c56ae6d..463028f 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -24,6 +24,7 @@ THE SOFTWARE. */ #include #include +#include #include #include #include @@ -34,13 +35,37 @@ THE SOFTWARE. */ #include #include +#ifndef __FUNCTION__ +# define __FUNCTION__ __func__ +#endif + +uint64_t bcf_double_missing = 0x7ff0000000000001; +uint64_t bcf_double_vector_end = 0x7ff0000000000002; +static inline void bcf_double_set(double *ptr, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.i = value; + *ptr = u.d; +} +static inline int bcf_double_test(double d, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.d = d; + return u.i==value ? 1 : 0; +} +#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end) +#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) +#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) +#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) + + typedef struct _token_t { // read-only values, same for all VCF lines int tok_type; // one of the TOK_* keys below char *key; // set only for string constants, otherwise NULL char *tag; // for debugging and printout only, VCF tag name - float threshold; // filtering threshold + double threshold; // filtering threshold int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*]) void (*setter)(filter_t *, bcf1_t *, struct _token_t *); @@ -49,7 +74,7 @@ typedef struct _token_t regex_t *regex; // precompiled regex for string comparison // modified on filter evaluation at each VCF line - float *values; // In case str_value is set, values[0] is one sample's string length + double *values; // In case str_value is set, values[0] is one sample's string length char *str_value; // and values[0]*nsamples gives the total length; int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues int pass_site; // -1 not applicable, 0 fails, >0 pass @@ -67,7 +92,8 @@ struct _filter_t int nfilters; token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack int32_t *tmpi; - int max_unpack, mtmpi, nsamples; + float *tmpf; + int max_unpack, mtmpi, mtmpf, nsamples; }; @@ -221,13 +247,15 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok) tok->nvalues = 0; else { - tok->values[0] = line->qual; + tok->values[0] = (double)line->qual; tok->nvalues = 1; } } static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok) { tok->values[0] = bcf_get_variant_types(line); + if ( !tok->values[0] ) tok->values[0] = 1; // mistake in htslib: VCF_* should start with 1 + else tok->values[0] = ((int)tok->values[0]) << 1; tok->nvalues = 1; } static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) @@ -272,6 +300,13 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) tok->str_value = NULL; } } +static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +{ + int a = (int)(atok->nvalues?atok->values[0]:atok->threshold); + int b = (int)(btok->nvalues?btok->values[0]:btok->threshold); + if ( op_type==TOK_LIKE ) return a&b ? 1 : 0; + return a&b ? 0 : 1; +} static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line) { int i; @@ -316,7 +351,7 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin } /** - * bcf_get_info_value() - get single INFO value, int or float + * bcf_get_info_value() - get single INFO value, int64_t or double * @line: BCF line * @info_id: tag ID, as returned by bcf_hdr_id2int * @ivec: 0-based index to retrieve, -1 when single value is expected @@ -336,8 +371,8 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) bcf_info_t *info = &line->d.info[j]; if ( info->len == 1 ) { - if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f; - else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i; + if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f; + else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i; return 1; } @@ -354,10 +389,10 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) return 1; \ } switch (info->type) { - case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break; - case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break; - case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break; - case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break; + case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int64_t); break; + case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break; + case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break; default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH @@ -374,14 +409,18 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi); - tok->nvalues = n; - hts_expand(float,n,tok->mvalues,tok->values); - for (i=0; ivalues[i] = flt->tmpi[i]; + int i; + tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi); + if ( tok->nvalues<=0 ) tok->nvalues = 0; + else + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + for (i=0; invalues; i++) tok->values[i] = flt->tmpi[i]; + } } else { - int32_t value; + int64_t value; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else @@ -396,12 +435,20 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues); - if ( tok->nvalues<0 ) tok->nvalues = 0; + int i; + tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf); + if ( tok->nvalues<=0 ) tok->nvalues = 0; + else + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + for (i=0; invalues; i++) + if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]); + else tok->values[i] = flt->tmpf[i]; + } } else { - float value; + double value; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else @@ -460,11 +507,11 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) else { int is_missing = 1; - hts_expand(float,tok->nvalues,tok->mvalues,tok->values); + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); for (i=0; invalues; i++) { if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end ) - bcf_float_set_missing(tok->values[i]); + bcf_double_set_missing(tok->values[i]); else { tok->values[i] = flt->tmpi[i]; @@ -490,20 +537,38 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) } static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) { - if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 ) + int i; + if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 ) + { tok->nvalues = tok->nsamples = 0; // missing values - else if ( tok->idx >= 0 ) + } + else { - int i, nsmpl, nvals; - nsmpl = bcf_hdr_nsamples(flt->hdr); - nvals = tok->nvalues / nsmpl; - if ( tok->idx >= nvals ) - tok->nsamples = tok->nvalues = 0; // the index is too big - else + int is_missing = 1; + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + for (i=0; invalues; i++) { - for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; - tok->nsamples = tok->nvalues = nsmpl; + if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) ) + bcf_double_set_missing(tok->values[i]); + else + { + tok->values[i] = flt->tmpf[i]; + is_missing = 0; + } + } + if ( is_missing ) tok->nvalues = 0; + else if ( tok->idx >= 0 ) + { + int nsmpl = bcf_hdr_nsamples(flt->hdr); + int nvals = tok->nvalues / nsmpl; + if ( tok->idx >= nvals ) + tok->nvalues = 0; // the index is too big + else + { + for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; + tok->nvalues = nsmpl; + } } } tok->nsamples = tok->nvalues; @@ -567,7 +632,7 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to tok->nvalues = tok->nsamples = 0; return; } - int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr); + int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr); kstring_t str; gt_length_too_big: @@ -576,29 +641,15 @@ gt_length_too_big: { int plen = str.l; - #define BRANCH(type_t) { \ - type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \ - if ( !(ptr[0]>>1) ) kputc('.',&str); \ - } - switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t); break; - case BCF_BT_INT16: BRANCH(int16_t); break; - case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break; - } - #undef BRANCH - - if ( plen==str.l ) + bcf_format_gt(fmt, i, &str); + kputc_(0,&str); + if ( str.l - plen > blen ) { - bcf_format_gt(fmt, i, &str); - if ( str.l - plen > blen ) - { - // too many alternate alleles or ploidy is too large, the genotype does not fit - // three characters ("0/0" vs "10/10"). - tok->str_value = str.s; - blen *= 2; - goto gt_length_too_big; - } + // too many alternate alleles or ploidy is too large, the genotype does not fit + // three characters ("0/0" vs "10/10"). + tok->str_value = str.s; + blen *= 2; + goto gt_length_too_big; } plen = str.l - plen; @@ -680,7 +731,7 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok) } else { - hts_expand(float,line->n_allele,tok->mvalues,tok->values); + hts_expand(double,line->n_allele,tok->mvalues,tok->values); for (i=1; in_allele; i++) tok->values[i-1] = flt->tmpi[i]; tok->nvalues = line->n_allele - 1; @@ -706,7 +757,7 @@ static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok) if ( !tok->nvalues ) return; int i, an = flt->tmpi[0]; for (i=0; invalues; i++) - tok->values[i] /= (float)an; + tok->values[i] /= (double)an; } static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -715,18 +766,18 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok) int i, an = flt->tmpi[0]; for (i=0; invalues; i++) { - tok->values[i] /= (float)an; + tok->values[i] /= (double)an; if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i]; } } static void set_max(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = -HUGE_VAL; + double val = -HUGE_VAL; int i; for (i=0; invalues; i++) { - if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i]; + if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i]; } tok->values[0] = val; tok->nvalues = 1; @@ -734,30 +785,30 @@ static void set_max(filter_t *flt, bcf1_t *line, token_t *tok) } static void set_min(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = HUGE_VAL; + double val = HUGE_VAL; int i; for (i=0; invalues; i++) - if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i]; + if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i]; tok->values[0] = val; tok->nvalues = 1; tok->nsamples = 0; } static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = 0; + double val = 0; int i, n = 0; for (i=0; invalues; i++) - if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } + if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } tok->values[0] = n ? val / n : 0; tok->nvalues = 1; tok->nsamples = 0; } static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = 0; + double val = 0; int i, n = 0; for (i=0; invalues; i++) - if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } + if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } tok->values[0] = val; tok->nvalues = 1; tok->nsamples = 0; @@ -812,20 +863,20 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { \ for (i=0; i<(atok)->nvalues; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[i]) ) continue; \ - if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \ + if ( bcf_double_is_missing((atok)->values[i]) ) continue; \ + if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \ has_values = 1; \ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \ } \ } \ else if ( (btok)->nsamples ) \ { \ - hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \ + hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \ for (i=0; i<(btok)->nvalues; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \ + if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \ { \ - bcf_float_set_missing((atok)->values[i]); \ + bcf_double_set_missing((atok)->values[i]); \ continue; \ } \ has_values = 1; \ @@ -838,9 +889,9 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { \ for (i=0; i<(atok)->nvalues; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \ + if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \ { \ - bcf_float_set_missing((atok)->values[i]); \ + bcf_double_set_missing((atok)->values[i]); \ continue; \ } \ has_values = 1; \ @@ -921,10 +972,14 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) for (i=0; insamples; i++) atok->pass_samples[i] = btok->pass_samples[i]; atok->nsamples = btok->nsamples; + atok->nvalues = 1; return btok->pass_site; } if ( !btok->nvalues ) // missing value in b + { + btok->nvalues = 1; return atok->pass_site; + } if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site; if ( !atok->nsamples ) @@ -978,6 +1033,7 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \ token_t *tok = (atok)->is_missing ? (btok) : (atok); \ (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \ + tok->nvalues = 1; \ } #define CMP_VECTORS(atok,btok,CMP_OP,ret) \ @@ -990,8 +1046,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) { \ for (i=0; i<(atok)->nsamples; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ has_values = 1; \ if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ else (atok)->pass_samples[i] = 0; \ @@ -1000,34 +1054,26 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) } \ else if ( (atok)->nsamples ) \ { \ - if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ - else \ + for (i=0; i<(atok)->nsamples; i++) \ { \ - for (i=0; i<(atok)->nsamples; i++) \ - { \ - if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - has_values = 1; \ - if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ - } \ + /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \ + has_values = 1; \ + if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ + else (atok)->pass_samples[i] = 0; \ } \ if ( !has_values ) (atok)->nvalues = 0; \ } \ else if ( (btok)->nsamples ) \ { \ - if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ - else \ + for (i=0; i<(btok)->nsamples; i++) \ { \ - for (i=0; i<(btok)->nsamples; i++) \ - { \ - if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - has_values = 1; \ - if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ - } \ - (atok)->nvalues = (btok)->nvalues; \ - (atok)->nsamples = (btok)->nsamples; \ + if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ + has_values = 1; \ + if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ + else (atok)->pass_samples[i] = 0; \ } \ + (atok)->nvalues = (btok)->nvalues; \ + (atok)->nsamples = (btok)->nsamples; \ if ( !has_values ) (atok)->nvalues = 0; \ } \ else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \ @@ -1124,10 +1170,23 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log } return pass_site; } -static int regex_vector_strings(token_t *atok, token_t *btok) +static int regex_vector_strings(token_t *atok, token_t *btok, int negate) { - int ret = regexec(btok->regex, atok->str_value, 0,NULL,0); - return ret==0 ? 1 : 0; + int i, pass_site = 0; + if ( atok->nsamples ) + { + for (i=0; insamples; i++) + { + char *ptr = atok->str_value + i*(int)atok->values[0]; + atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1; + if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; + pass_site |= atok->pass_samples[i]; + } + return pass_site; + } + pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1; + if ( negate ) pass_site = pass_site ? 0 : 1; + return pass_site; } static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) @@ -1143,7 +1202,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) int quote = str[0]; if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str); tok->key = (char*) calloc(len-1,sizeof(char)); - hts_expand(float,1,tok->mvalues,tok->values); + hts_expand(double,1,tok->mvalues,tok->values); tok->values[0] = len-2; memcpy(tok->key,str+1,len-2); tok->key[len-2] = 0; @@ -1372,11 +1431,18 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) return 0; } - // is it a value? + // is it a value? Here we parse as integer/float separately and use strtof + // rather than strtod, because the more accurate double representation + // would invalidate floating point comparisons like QUAL=59.2, obtained via + // htslib/vcf parser char *end; - errno = 0; - tok->threshold = strtod(tmp.s, &end); - if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + tok->threshold = strtol(tmp.s, &end, 10); // integer? + if ( end - tmp.s != strlen(tmp.s) ) + { + errno = 0; + tok->threshold = strtof(tmp.s, &end); // float? + if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + } if ( tmp.s ) free(tmp.s); return 0; @@ -1511,11 +1577,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) // Look for j="." and k numeric type int j = i-1, k = i-2; if ( !out[j].is_str ) { k = i-1, j = i-2; } - if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) ) + if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) ) { int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); - if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); } - if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); } + if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } + if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } } } if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE ) @@ -1524,7 +1590,14 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) if ( !out[j].key ) error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str); out[j].regex = (regex_t *) malloc(sizeof(regex_t)); - if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) ) + int cflags = REG_NOSUB; + int len = strlen(out[j].key); + if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\' ) + { + out[j].key[len-2] = 0; + cflags |= REG_ICASE; + } + if ( regcomp(out[j].regex, out[j].key, cflags) ) error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str); } if ( out[i].tok_type!=TOK_VAL ) continue; @@ -1532,41 +1605,47 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) if ( !strcmp(out[i].tag,"TYPE") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int j = i+1; - if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; - if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); - if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; } - else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str); - out[j].tag = out[j].key; out[j].key = NULL; - i = j; + int itok, ival; + if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1; + else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1; + else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1; + else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1; + else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } + else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); + if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; + out[ival].tag = out[ival].key; out[ival].key = NULL; + i = itok; continue; } if ( !strcmp(out[i].tag,"FILTER") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int j = i+1; - if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value" - if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and != - if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE; - if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ; - if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE; - if ( out[j].tok_type!=TOK_VAL || !out[j].key ) + int itok = i, ival; + if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1; + else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1; + else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1; + else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i; + else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i; + else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i; + else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + if ( out[ival].tok_type!=TOK_VAL || !out[ival].key ) error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); - if ( strcmp(".",out[j].key) ) + if ( strcmp(".",out[ival].key) ) { - out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key); - if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) ) - error("The filter \"%s\" not present in the VCF header\n", out[j].key); + out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key); + if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) ) + error("The filter \"%s\" not present in the VCF header\n", out[ival].key); } else - out[j].hdr_id = -1; - out[j].tag = out[j].key; out[j].key = NULL; - out[i].hdr_id = out[j].hdr_id; - i = j; + out[ival].hdr_id = -1; + out[ival].tag = out[ival].key; out[ival].key = NULL; + out[itok].hdr_id = out[ival].hdr_id; continue; } } @@ -1579,7 +1658,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; } - hts_expand0(float,1,out[i].mvalues,out[i].values); + hts_expand0(double,1,out[i].mvalues,out[i].values); if ( filter->nsamples ) { out[i].pass_samples = (uint8_t*)malloc(filter->nsamples); @@ -1618,6 +1697,7 @@ void filter_destroy(filter_t *filter) free(filter->flt_stack); free(filter->str); free(filter->tmpi); + free(filter->tmpf); free(filter); } @@ -1704,7 +1784,9 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) } int is_true = 0; - if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues ) + if ( filter->filters[i].comparator ) + is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line); + else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues ) { int skip = 0; if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1; @@ -1746,10 +1828,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE ) { if ( is_str==2 ) - { - is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]); - if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1; - } + is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1); else error("The regex operator can be used on strings only: %s\n", filter->str); } diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 531339e..44046f2 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -26,6 +26,7 @@ THE SOFTWARE. */ #include #include +#include #include #include #include @@ -36,13 +37,37 @@ THE SOFTWARE. */ #include #include +#ifndef __FUNCTION__ +# define __FUNCTION__ __func__ +#endif + +uint64_t bcf_double_missing = 0x7ff0000000000001; +uint64_t bcf_double_vector_end = 0x7ff0000000000002; +static inline void bcf_double_set(double *ptr, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.i = value; + *ptr = u.d; +} +static inline int bcf_double_test(double d, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.d = d; + return u.i==value ? 1 : 0; +} +#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end) +#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) +#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) +#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) + + typedef struct _token_t { // read-only values, same for all VCF lines int tok_type; // one of the TOK_* keys below char *key; // set only for string constants, otherwise NULL char *tag; // for debugging and printout only, VCF tag name - float threshold; // filtering threshold + double threshold; // filtering threshold int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*]) void (*setter)(filter_t *, bcf1_t *, struct _token_t *); @@ -51,7 +76,7 @@ typedef struct _token_t regex_t *regex; // precompiled regex for string comparison // modified on filter evaluation at each VCF line - float *values; // In case str_value is set, values[0] is one sample's string length + double *values; // In case str_value is set, values[0] is one sample's string length char *str_value; // and values[0]*nsamples gives the total length; int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues int pass_site; // -1 not applicable, 0 fails, >0 pass @@ -69,7 +94,8 @@ struct _filter_t int nfilters; token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack int32_t *tmpi; - int max_unpack, mtmpi, nsamples; + float *tmpf; + int max_unpack, mtmpi, mtmpf, nsamples; }; @@ -223,13 +249,15 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok) tok->nvalues = 0; else { - tok->values[0] = line->qual; + tok->values[0] = (double)line->qual; tok->nvalues = 1; } } static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok) { tok->values[0] = bcf_get_variant_types(line); + if ( !tok->values[0] ) tok->values[0] = 1; // mistake in htslib: VCF_* should start with 1 + else tok->values[0] = ((int)tok->values[0]) << 1; tok->nvalues = 1; } static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) @@ -274,6 +302,13 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) tok->str_value = NULL; } } +static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +{ + int a = (int)(atok->nvalues?atok->values[0]:atok->threshold); + int b = (int)(btok->nvalues?btok->values[0]:btok->threshold); + if ( op_type==TOK_LIKE ) return a&b ? 1 : 0; + return a&b ? 0 : 1; +} static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line) { int i; @@ -318,7 +353,7 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin } /** - * bcf_get_info_value() - get single INFO value, int or float + * bcf_get_info_value() - get single INFO value, int64_t or double * @line: BCF line * @info_id: tag ID, as returned by bcf_hdr_id2int * @ivec: 0-based index to retrieve, -1 when single value is expected @@ -338,8 +373,8 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) bcf_info_t *info = &line->d.info[j]; if ( info->len == 1 ) { - if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f; - else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i; + if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f; + else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i; return 1; } @@ -356,10 +391,10 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) return 1; \ } switch (info->type) { - case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break; - case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break; - case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break; - case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break; + case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int64_t); break; + case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break; + case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break; default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH @@ -376,14 +411,18 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi); - tok->nvalues = n; - hts_expand(float,n,tok->mvalues,tok->values); - for (i=0; ivalues[i] = flt->tmpi[i]; + int i; + tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi); + if ( tok->nvalues<=0 ) tok->nvalues = 0; + else + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + for (i=0; invalues; i++) tok->values[i] = flt->tmpi[i]; + } } else { - int32_t value; + int64_t value; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else @@ -398,12 +437,20 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues); - if ( tok->nvalues<0 ) tok->nvalues = 0; + int i; + tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf); + if ( tok->nvalues<=0 ) tok->nvalues = 0; + else + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + for (i=0; invalues; i++) + if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]); + else tok->values[i] = flt->tmpf[i]; + } } else { - float value; + double value; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else @@ -462,11 +509,11 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) else { int is_missing = 1; - hts_expand(float,tok->nvalues,tok->mvalues,tok->values); + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); for (i=0; invalues; i++) { if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end ) - bcf_float_set_missing(tok->values[i]); + bcf_double_set_missing(tok->values[i]); else { tok->values[i] = flt->tmpi[i]; @@ -492,20 +539,38 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) } static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) { - if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 ) + int i; + if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 ) + { tok->nvalues = tok->nsamples = 0; // missing values - else if ( tok->idx >= 0 ) + } + else { - int i, nsmpl, nvals; - nsmpl = bcf_hdr_nsamples(flt->hdr); - nvals = tok->nvalues / nsmpl; - if ( tok->idx >= nvals ) - tok->nsamples = tok->nvalues = 0; // the index is too big - else + int is_missing = 1; + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + for (i=0; invalues; i++) { - for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; - tok->nsamples = tok->nvalues = nsmpl; + if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) ) + bcf_double_set_missing(tok->values[i]); + else + { + tok->values[i] = flt->tmpf[i]; + is_missing = 0; + } + } + if ( is_missing ) tok->nvalues = 0; + else if ( tok->idx >= 0 ) + { + int nsmpl = bcf_hdr_nsamples(flt->hdr); + int nvals = tok->nvalues / nsmpl; + if ( tok->idx >= nvals ) + tok->nvalues = 0; // the index is too big + else + { + for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; + tok->nvalues = nsmpl; + } } } tok->nsamples = tok->nvalues; @@ -569,7 +634,7 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to tok->nvalues = tok->nsamples = 0; return; } - int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr); + int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr); kstring_t str; gt_length_too_big: @@ -578,29 +643,15 @@ gt_length_too_big: { int plen = str.l; - #define BRANCH(type_t) { \ - type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \ - if ( !(ptr[0]>>1) ) kputc('.',&str); \ - } - switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t); break; - case BCF_BT_INT16: BRANCH(int16_t); break; - case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(pysam_stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break; - } - #undef BRANCH - - if ( plen==str.l ) + bcf_format_gt(fmt, i, &str); + kputc_(0,&str); + if ( str.l - plen > blen ) { - bcf_format_gt(fmt, i, &str); - if ( str.l - plen > blen ) - { - // too many alternate alleles or ploidy is too large, the genotype does not fit - // three characters ("0/0" vs "10/10"). - tok->str_value = str.s; - blen *= 2; - goto gt_length_too_big; - } + // too many alternate alleles or ploidy is too large, the genotype does not fit + // three characters ("0/0" vs "10/10"). + tok->str_value = str.s; + blen *= 2; + goto gt_length_too_big; } plen = str.l - plen; @@ -682,7 +733,7 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok) } else { - hts_expand(float,line->n_allele,tok->mvalues,tok->values); + hts_expand(double,line->n_allele,tok->mvalues,tok->values); for (i=1; in_allele; i++) tok->values[i-1] = flt->tmpi[i]; tok->nvalues = line->n_allele - 1; @@ -708,7 +759,7 @@ static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok) if ( !tok->nvalues ) return; int i, an = flt->tmpi[0]; for (i=0; invalues; i++) - tok->values[i] /= (float)an; + tok->values[i] /= (double)an; } static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -717,18 +768,18 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok) int i, an = flt->tmpi[0]; for (i=0; invalues; i++) { - tok->values[i] /= (float)an; + tok->values[i] /= (double)an; if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i]; } } static void set_max(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = -HUGE_VAL; + double val = -HUGE_VAL; int i; for (i=0; invalues; i++) { - if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i]; + if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i]; } tok->values[0] = val; tok->nvalues = 1; @@ -736,30 +787,30 @@ static void set_max(filter_t *flt, bcf1_t *line, token_t *tok) } static void set_min(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = HUGE_VAL; + double val = HUGE_VAL; int i; for (i=0; invalues; i++) - if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i]; + if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i]; tok->values[0] = val; tok->nvalues = 1; tok->nsamples = 0; } static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = 0; + double val = 0; int i, n = 0; for (i=0; invalues; i++) - if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } + if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } tok->values[0] = n ? val / n : 0; tok->nvalues = 1; tok->nsamples = 0; } static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok) { - float val = 0; + double val = 0; int i, n = 0; for (i=0; invalues; i++) - if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } + if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } tok->values[0] = val; tok->nvalues = 1; tok->nsamples = 0; @@ -814,20 +865,20 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { \ for (i=0; i<(atok)->nvalues; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[i]) ) continue; \ - if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \ + if ( bcf_double_is_missing((atok)->values[i]) ) continue; \ + if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \ has_values = 1; \ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \ } \ } \ else if ( (btok)->nsamples ) \ { \ - hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \ + hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \ for (i=0; i<(btok)->nvalues; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \ + if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \ { \ - bcf_float_set_missing((atok)->values[i]); \ + bcf_double_set_missing((atok)->values[i]); \ continue; \ } \ has_values = 1; \ @@ -840,9 +891,9 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { \ for (i=0; i<(atok)->nvalues; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \ + if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \ { \ - bcf_float_set_missing((atok)->values[i]); \ + bcf_double_set_missing((atok)->values[i]); \ continue; \ } \ has_values = 1; \ @@ -923,10 +974,14 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) for (i=0; insamples; i++) atok->pass_samples[i] = btok->pass_samples[i]; atok->nsamples = btok->nsamples; + atok->nvalues = 1; return btok->pass_site; } if ( !btok->nvalues ) // missing value in b + { + btok->nvalues = 1; return atok->pass_site; + } if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site; if ( !atok->nsamples ) @@ -980,6 +1035,7 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \ token_t *tok = (atok)->is_missing ? (btok) : (atok); \ (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \ + tok->nvalues = 1; \ } #define CMP_VECTORS(atok,btok,CMP_OP,ret) \ @@ -992,8 +1048,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) { \ for (i=0; i<(atok)->nsamples; i++) \ { \ - if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ has_values = 1; \ if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ else (atok)->pass_samples[i] = 0; \ @@ -1002,34 +1056,26 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) } \ else if ( (atok)->nsamples ) \ { \ - if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ - else \ + for (i=0; i<(atok)->nsamples; i++) \ { \ - for (i=0; i<(atok)->nsamples; i++) \ - { \ - if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - has_values = 1; \ - if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ - } \ + /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \ + has_values = 1; \ + if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ + else (atok)->pass_samples[i] = 0; \ } \ if ( !has_values ) (atok)->nvalues = 0; \ } \ else if ( (btok)->nsamples ) \ { \ - if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ - else \ + for (i=0; i<(btok)->nsamples; i++) \ { \ - for (i=0; i<(btok)->nsamples; i++) \ - { \ - if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - has_values = 1; \ - if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ - } \ - (atok)->nvalues = (btok)->nvalues; \ - (atok)->nsamples = (btok)->nsamples; \ + if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ + has_values = 1; \ + if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ + else (atok)->pass_samples[i] = 0; \ } \ + (atok)->nvalues = (btok)->nvalues; \ + (atok)->nsamples = (btok)->nsamples; \ if ( !has_values ) (atok)->nvalues = 0; \ } \ else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \ @@ -1126,10 +1172,23 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log } return pass_site; } -static int regex_vector_strings(token_t *atok, token_t *btok) +static int regex_vector_strings(token_t *atok, token_t *btok, int negate) { - int ret = regexec(btok->regex, atok->str_value, 0,NULL,0); - return ret==0 ? 1 : 0; + int i, pass_site = 0; + if ( atok->nsamples ) + { + for (i=0; insamples; i++) + { + char *ptr = atok->str_value + i*(int)atok->values[0]; + atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1; + if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; + pass_site |= atok->pass_samples[i]; + } + return pass_site; + } + pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1; + if ( negate ) pass_site = pass_site ? 0 : 1; + return pass_site; } static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) @@ -1145,7 +1204,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) int quote = str[0]; if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str); tok->key = (char*) calloc(len-1,sizeof(char)); - hts_expand(float,1,tok->mvalues,tok->values); + hts_expand(double,1,tok->mvalues,tok->values); tok->values[0] = len-2; memcpy(tok->key,str+1,len-2); tok->key[len-2] = 0; @@ -1374,11 +1433,18 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) return 0; } - // is it a value? + // is it a value? Here we parse as integer/float separately and use strtof + // rather than strtod, because the more accurate double representation + // would invalidate floating point comparisons like QUAL=59.2, obtained via + // htslib/vcf parser char *end; - errno = 0; - tok->threshold = strtod(tmp.s, &end); - if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + tok->threshold = strtol(tmp.s, &end, 10); // integer? + if ( end - tmp.s != strlen(tmp.s) ) + { + errno = 0; + tok->threshold = strtof(tmp.s, &end); // float? + if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + } if ( tmp.s ) free(tmp.s); return 0; @@ -1513,11 +1579,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) // Look for j="." and k numeric type int j = i-1, k = i-2; if ( !out[j].is_str ) { k = i-1, j = i-2; } - if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) ) + if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) ) { int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); - if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); } - if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); } + if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } + if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } } } if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE ) @@ -1526,7 +1592,14 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) if ( !out[j].key ) error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str); out[j].regex = (regex_t *) malloc(sizeof(regex_t)); - if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) ) + int cflags = REG_NOSUB; + int len = strlen(out[j].key); + if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\' ) + { + out[j].key[len-2] = 0; + cflags |= REG_ICASE; + } + if ( regcomp(out[j].regex, out[j].key, cflags) ) error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str); } if ( out[i].tok_type!=TOK_VAL ) continue; @@ -1534,41 +1607,47 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) if ( !strcmp(out[i].tag,"TYPE") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int j = i+1; - if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; - if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); - if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; } - else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; } - else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str); - out[j].tag = out[j].key; out[j].key = NULL; - i = j; + int itok, ival; + if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1; + else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1; + else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1; + else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1; + else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; } + else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } + else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); + if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; + out[ival].tag = out[ival].key; out[ival].key = NULL; + i = itok; continue; } if ( !strcmp(out[i].tag,"FILTER") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int j = i+1; - if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value" - if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and != - if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE; - if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ; - if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE; - if ( out[j].tok_type!=TOK_VAL || !out[j].key ) + int itok = i, ival; + if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1; + else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1; + else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1; + else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i; + else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i; + else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i; + else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + if ( out[ival].tok_type!=TOK_VAL || !out[ival].key ) error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); - if ( strcmp(".",out[j].key) ) + if ( strcmp(".",out[ival].key) ) { - out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key); - if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) ) - error("The filter \"%s\" not present in the VCF header\n", out[j].key); + out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key); + if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) ) + error("The filter \"%s\" not present in the VCF header\n", out[ival].key); } else - out[j].hdr_id = -1; - out[j].tag = out[j].key; out[j].key = NULL; - out[i].hdr_id = out[j].hdr_id; - i = j; + out[ival].hdr_id = -1; + out[ival].tag = out[ival].key; out[ival].key = NULL; + out[itok].hdr_id = out[ival].hdr_id; continue; } } @@ -1581,7 +1660,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; } - hts_expand0(float,1,out[i].mvalues,out[i].values); + hts_expand0(double,1,out[i].mvalues,out[i].values); if ( filter->nsamples ) { out[i].pass_samples = (uint8_t*)malloc(filter->nsamples); @@ -1620,6 +1699,7 @@ void filter_destroy(filter_t *filter) free(filter->flt_stack); free(filter->str); free(filter->tmpi); + free(filter->tmpf); free(filter); } @@ -1706,7 +1786,9 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) } int is_true = 0; - if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues ) + if ( filter->filters[i].comparator ) + is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line); + else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues ) { int skip = 0; if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1; @@ -1748,10 +1830,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE ) { if ( is_str==2 ) - { - is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]); - if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1; - } + is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1); else error("The regex operator can be used on strings only: %s\n", filter->str); } diff --git a/bcftools/hclust.c b/bcftools/hclust.c new file mode 100644 index 0000000..692fa54 --- /dev/null +++ b/bcftools/hclust.c @@ -0,0 +1,400 @@ +/* The MIT License + + Copyright (c) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include +#include +#include "bcftools.h" +#include "hclust.h" + +typedef struct _node_t +{ + struct _node_t *akid, *bkid, *next, *prev, *parent; + int id, idx; // id: unique node id; idx: current index to pdist + float value; // max pairwise dist of elements within the node +} +node_t; + +struct _hclust_t +{ + int ndat, nclust; // ndat: number of elements (pdist matrix size); nclust: current number of clusters + float *pdist; // pairwise cluster distances, diagonal matrix accessed via the PDIST macro + node_t *first, *last; // clusters are maintained in a double-linked list + node_t **rmme; // convenience array to remove all allocated nodes at the end + int nrmme; + kstring_t str; // (for debugging) pointer to str.s is returned by create_dot() + char **dbg; // (for debugging) created by create_list() via set_threshold() and returned by explain() + int ndbg, mdbg; +}; + +node_t *append_node(hclust_t *clust, int idx) +{ + node_t *node = (node_t*) calloc(1,sizeof(node_t)); + + clust->nclust++; + node->id = clust->nrmme; + node->idx = idx; + if ( !clust->first ) + { + clust->first = node; + clust->last = node; + } + else + { + node->prev = clust->last; + clust->last->next = node; + clust->last = node; + } + + if ( clust->nrmme >= clust->ndat*2 ) error("hclust fixme: %d vs %d\n",clust->nrmme,clust->ndat); + clust->rmme[clust->nrmme++] = node; + + return node; +} +void remove_node(hclust_t *clust, node_t *node) +{ + if ( node==clust->first ) clust->first = node->next; + if ( node==clust->last ) clust->last = node->prev; + if ( node->next ) node->next->prev = node->prev; + if ( node->prev ) node->prev->next = node->next; + clust->nclust--; +} + +#if DEBUG +void hclust_debug(hclust_t *clust) +{ + int i; + fprintf(stderr,"nrmme=%d nclust=%d\n", clust->nrmme,clust->nclust); + for (i=0; inrmme; i++) + { + node_t *node = clust->rmme[i]; + int akid = node->akid ? node->akid->id : -1; + int bkid = node->bkid ? node->bkid->id : -1; + int akidx = node->akid ? node->akid->idx : -1; + int bkidx = node->bkid ? node->bkid->idx : -1; + fprintf(stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx); + } + + int j; + for (i=1; indat; i++) + { + int active = 0; + node_t *node = clust->first; + while (node) + { + if ( node->idx==i ) { active=1; break; } + node = node->next; + } + fprintf(stderr,"%2d%c ",i,active?'*':' '); + for (j=0; jpdist,i,j)==9 ) + fprintf(stderr," ----- "); + else + fprintf(stderr," %f", PDIST(clust->pdist,i,j)); + } + fprintf(stderr,"\n"); + } + for (j=0; jndat-1; j++) fprintf(stderr," %6d ",j); fprintf(stderr,"\n"); +} +#endif + +hclust_t *hclust_init(int n, float *pdist) +{ + hclust_t *clust = (hclust_t*) calloc(1,sizeof(hclust_t)); + clust->ndat = n; + clust->pdist = pdist; + clust->rmme = (node_t**) calloc(n*2,sizeof(node_t*)); + + // init clusters + int i; + for (i=0; indat; i++) append_node(clust,i); + + // build the tree + while ( clust->nclust>1 ) + { + // find two clusters with minimum distance + float min_value = HUGE_VAL; + node_t *iclust = clust->first->next; + node_t *min_iclust = NULL, *min_jclust = NULL; + while ( iclust ) + { + node_t *jclust = clust->first; + while ( jclust!=iclust ) + { + float value = PDIST(clust->pdist,iclust->idx,jclust->idx); + if ( value < min_value ) + { + min_value = value; + min_iclust = iclust; + min_jclust = jclust; + } + jclust = jclust->next; + } + iclust = iclust->next; + } + assert( min_iclust && min_jclust ); // pdist contains inf or nan, fix the caller + remove_node(clust,min_iclust); + remove_node(clust,min_jclust); + + // update the pairwise distances. We keep the matrix and as we are moving up the + // tree, we use fewer columns/rows as the number of clusters decreases: we reuse + // i-th and leave j-th unused. Inter-cluster distance is defined as maximum distance + // between pairwise distances of elements within the cluster. + iclust = clust->first; + while ( iclust ) + { + if ( PDIST(clust->pdist,iclust->idx,min_iclust->idx) < PDIST(clust->pdist,iclust->idx,min_jclust->idx) ) + PDIST(clust->pdist,iclust->idx,min_iclust->idx) = PDIST(clust->pdist,iclust->idx,min_jclust->idx); + iclust = iclust->next; + } + + node_t *node = append_node(clust,min_iclust->idx); + node->akid = min_iclust; + node->bkid = min_jclust; + node->value = min_value; + node->akid->parent = node; + node->bkid->parent = node; + } + + return clust; +} +void hclust_destroy(hclust_t *clust) +{ + int i; + for (i=0; inrmme; i++) free(clust->rmme[i]); + free(clust->rmme); + free(clust->dbg); + free(clust->str.s); + free(clust); +} + +char *hclust_create_dot(hclust_t *clust, char **labels, float th) +{ + clust->str.l = 0; + ksprintf(&clust->str,"digraph myGraph {"); + + int i; + for (i=0; inrmme; i++) + { + node_t *node = clust->rmme[i]; + if ( node->value ) + ksprintf(&clust->str,"\"%d\" [label=\"%f\"];", node->id,node->value); + else + ksprintf(&clust->str,"\"%d\" [label=\"%s\"];", node->id,labels[node->idx]); + } + for (i=0; inrmme; i++) + { + node_t *node = clust->rmme[i]; + if ( node->akid ) + { + if ( node->value >= th && node->akid && node->akid->value < th ) + ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->akid->id); + else + ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->akid->id); + } + + if ( node->bkid ) + { + if ( node->value >= th && node->bkid && node->bkid->value < th ) + ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->bkid->id); + else + ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->bkid->id); + } + } + ksprintf(&clust->str,"};"); + return clust->str.s; +} +char **hclust_explain(hclust_t *clust, int *nlines) +{ + clust->ndbg = 0; + char *beg = clust->str.s; + while ( *beg ) + { + char *end = beg; + while ( *end && *end!='\n' ) end++; + clust->ndbg++; + hts_expand(char*,clust->ndbg,clust->mdbg,clust->dbg); + clust->dbg[clust->ndbg-1] = beg; + if ( !*end ) break; + *end = 0; + beg = end + 1; + } + + *nlines = clust->ndbg; + return clust->dbg; +} + +cluster_t *append_cluster(node_t *node, cluster_t *cluster, int *nclust, node_t **stack) +{ + (*nclust)++; + cluster = (cluster_t*) realloc(cluster,sizeof(cluster_t)*(*nclust)); + cluster_t *clust = &cluster[*nclust-1]; + clust->nmemb = 0; + clust->memb = NULL; + clust->dist = node->value; + + int nstack = 1; + stack[0] = node; + + while ( nstack ) + { + node_t *node = stack[--nstack]; + node_t *akid = node->akid; + node_t *bkid = node->bkid; + if ( node->akid ) + { + stack[nstack++] = akid; + stack[nstack++] = bkid; + } + else + { + clust->nmemb++; + clust->memb = (int*) realloc(clust->memb,sizeof(int)*clust->nmemb); + clust->memb[clust->nmemb-1] = node->id; + } + } + return cluster; +} + +int cmp_nodes(const void *a, const void *b) +{ + const node_t *an = *((const node_t**) a); + const node_t *bn = *((const node_t**) b); + if ( an->value < bn->value ) return -1; + if ( an->value > bn->value ) return 1; + return 0; +} + +float calc_dev(node_t **dat, int n) +{ + float avg = 0, dev = 0; + int i; + for (i=0; ivalue; + avg /= n; + for (i=0; ivalue - avg)*(dat[i]->value - avg); + return sqrt(dev/n); +} + +/* + Heuristics to determine clustering cutoff: sort nodes by distance and + split into two groups by minimizing the standard deviation. + This works best when two elements from a single different sample are + included in the mix. + - min_inter_dist .. smaller values are always considered identical + - max_intra_dist .. larger values are always considered different + */ +float hclust_set_threshold(hclust_t *clust, float min_inter_dist, float max_intra_dist) +{ + node_t **dat = clust->rmme + clust->ndat; + int i, ndat = clust->nrmme - clust->ndat; + + qsort(dat, ndat, sizeof(dat), cmp_nodes); + + clust->str.l = 0; + float th, min_dev = HUGE_VAL; + int imin = -1; + for (i=0; i0 ) dev += calc_dev(dat,i); + if ( i+1value; + ksprintf(&clust->str,"DEV\t%f\t%f\n",th,dev); + if ( min_dev > dev && th >= min_inter_dist ) { min_dev = dev; imin = i; } + } + if ( max_intra_dist > 0 ) + th = max_intra_dist; // use fixed cutoff, the above was only for debugging output + else + { + // dynamic cutoff + max_intra_dist = fabs(max_intra_dist); + th = imin==-1 ? max_intra_dist : dat[imin]->value; + if ( th > max_intra_dist ) th = max_intra_dist; + } + ksprintf(&clust->str,"TH\t%f\n", th); + ksprintf(&clust->str,"MAX_DIST\t%f\n", dat[ndat-1]->value); + ksprintf(&clust->str,"MIN_INTER\t%f\n", min_inter_dist); + ksprintf(&clust->str,"MAX_INTRA\t%f\n", max_intra_dist); + return th; +} + +cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust) +{ + float cutoff = *max_intra_dist = hclust_set_threshold(clust, min_inter_dist, *max_intra_dist); + + node_t **stack = (node_t**) malloc(sizeof(node_t*)*clust->ndat); + node_t **tmp = (node_t**) malloc(sizeof(node_t*)*clust->ndat); + stack[0] = clust->first; + int nstack = 1; + + cluster_t *cluster = NULL; + int ncluster = 0; + + if ( stack[0]->value < cutoff ) + { + // all values are within the limits - create a single cluster + cluster = append_cluster(stack[0], cluster, &ncluster, tmp); + nstack = 0; + } + + while ( nstack ) + { + node_t *node = stack[--nstack]; + node_t *akid = node->akid; + node_t *bkid = node->bkid; + if ( !akid ) + { + cluster = append_cluster(node, cluster, &ncluster, tmp); + continue; + } + + if ( node->value >= cutoff && akid->value < cutoff ) + cluster = append_cluster(akid, cluster, &ncluster, tmp); + else + stack[nstack++] = akid; + + if ( node->value >= cutoff && bkid->value < cutoff ) + cluster = append_cluster(bkid, cluster, &ncluster, tmp); + else + stack[nstack++] = bkid; + } + + free(tmp); + free(stack); + + *nclust = ncluster; + return cluster; +} + +void hclust_destroy_list(cluster_t *clust, int nclust) +{ + int i; + for (i=0; i + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include +#include +#include "bcftools.h" +#include "hclust.h" + +typedef struct _node_t +{ + struct _node_t *akid, *bkid, *next, *prev, *parent; + int id, idx; // id: unique node id; idx: current index to pdist + float value; // max pairwise dist of elements within the node +} +node_t; + +struct _hclust_t +{ + int ndat, nclust; // ndat: number of elements (pdist matrix size); nclust: current number of clusters + float *pdist; // pairwise cluster distances, diagonal matrix accessed via the PDIST macro + node_t *first, *last; // clusters are maintained in a double-linked list + node_t **rmme; // convenience array to remove all allocated nodes at the end + int nrmme; + kstring_t str; // (for debugging) pointer to str.s is returned by create_dot() + char **dbg; // (for debugging) created by create_list() via set_threshold() and returned by explain() + int ndbg, mdbg; +}; + +node_t *append_node(hclust_t *clust, int idx) +{ + node_t *node = (node_t*) calloc(1,sizeof(node_t)); + + clust->nclust++; + node->id = clust->nrmme; + node->idx = idx; + if ( !clust->first ) + { + clust->first = node; + clust->last = node; + } + else + { + node->prev = clust->last; + clust->last->next = node; + clust->last = node; + } + + if ( clust->nrmme >= clust->ndat*2 ) error("hclust fixme: %d vs %d\n",clust->nrmme,clust->ndat); + clust->rmme[clust->nrmme++] = node; + + return node; +} +void remove_node(hclust_t *clust, node_t *node) +{ + if ( node==clust->first ) clust->first = node->next; + if ( node==clust->last ) clust->last = node->prev; + if ( node->next ) node->next->prev = node->prev; + if ( node->prev ) node->prev->next = node->next; + clust->nclust--; +} + +#if DEBUG +void hclust_debug(hclust_t *clust) +{ + int i; + fprintf(pysam_stderr,"nrmme=%d nclust=%d\n", clust->nrmme,clust->nclust); + for (i=0; inrmme; i++) + { + node_t *node = clust->rmme[i]; + int akid = node->akid ? node->akid->id : -1; + int bkid = node->bkid ? node->bkid->id : -1; + int akidx = node->akid ? node->akid->idx : -1; + int bkidx = node->bkid ? node->bkid->idx : -1; + fprintf(pysam_stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx); + } + + int j; + for (i=1; indat; i++) + { + int active = 0; + node_t *node = clust->first; + while (node) + { + if ( node->idx==i ) { active=1; break; } + node = node->next; + } + fprintf(pysam_stderr,"%2d%c ",i,active?'*':' '); + for (j=0; jpdist,i,j)==9 ) + fprintf(pysam_stderr," ----- "); + else + fprintf(pysam_stderr," %f", PDIST(clust->pdist,i,j)); + } + fprintf(pysam_stderr,"\n"); + } + for (j=0; jndat-1; j++) fprintf(pysam_stderr," %6d ",j); fprintf(pysam_stderr,"\n"); +} +#endif + +hclust_t *hclust_init(int n, float *pdist) +{ + hclust_t *clust = (hclust_t*) calloc(1,sizeof(hclust_t)); + clust->ndat = n; + clust->pdist = pdist; + clust->rmme = (node_t**) calloc(n*2,sizeof(node_t*)); + + // init clusters + int i; + for (i=0; indat; i++) append_node(clust,i); + + // build the tree + while ( clust->nclust>1 ) + { + // find two clusters with minimum distance + float min_value = HUGE_VAL; + node_t *iclust = clust->first->next; + node_t *min_iclust = NULL, *min_jclust = NULL; + while ( iclust ) + { + node_t *jclust = clust->first; + while ( jclust!=iclust ) + { + float value = PDIST(clust->pdist,iclust->idx,jclust->idx); + if ( value < min_value ) + { + min_value = value; + min_iclust = iclust; + min_jclust = jclust; + } + jclust = jclust->next; + } + iclust = iclust->next; + } + assert( min_iclust && min_jclust ); // pdist contains inf or nan, fix the caller + remove_node(clust,min_iclust); + remove_node(clust,min_jclust); + + // update the pairwise distances. We keep the matrix and as we are moving up the + // tree, we use fewer columns/rows as the number of clusters decreases: we reuse + // i-th and leave j-th unused. Inter-cluster distance is defined as maximum distance + // between pairwise distances of elements within the cluster. + iclust = clust->first; + while ( iclust ) + { + if ( PDIST(clust->pdist,iclust->idx,min_iclust->idx) < PDIST(clust->pdist,iclust->idx,min_jclust->idx) ) + PDIST(clust->pdist,iclust->idx,min_iclust->idx) = PDIST(clust->pdist,iclust->idx,min_jclust->idx); + iclust = iclust->next; + } + + node_t *node = append_node(clust,min_iclust->idx); + node->akid = min_iclust; + node->bkid = min_jclust; + node->value = min_value; + node->akid->parent = node; + node->bkid->parent = node; + } + + return clust; +} +void hclust_destroy(hclust_t *clust) +{ + int i; + for (i=0; inrmme; i++) free(clust->rmme[i]); + free(clust->rmme); + free(clust->dbg); + free(clust->str.s); + free(clust); +} + +char *hclust_create_dot(hclust_t *clust, char **labels, float th) +{ + clust->str.l = 0; + ksprintf(&clust->str,"digraph myGraph {"); + + int i; + for (i=0; inrmme; i++) + { + node_t *node = clust->rmme[i]; + if ( node->value ) + ksprintf(&clust->str,"\"%d\" [label=\"%f\"];", node->id,node->value); + else + ksprintf(&clust->str,"\"%d\" [label=\"%s\"];", node->id,labels[node->idx]); + } + for (i=0; inrmme; i++) + { + node_t *node = clust->rmme[i]; + if ( node->akid ) + { + if ( node->value >= th && node->akid && node->akid->value < th ) + ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->akid->id); + else + ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->akid->id); + } + + if ( node->bkid ) + { + if ( node->value >= th && node->bkid && node->bkid->value < th ) + ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->bkid->id); + else + ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->bkid->id); + } + } + ksprintf(&clust->str,"};"); + return clust->str.s; +} +char **hclust_explain(hclust_t *clust, int *nlines) +{ + clust->ndbg = 0; + char *beg = clust->str.s; + while ( *beg ) + { + char *end = beg; + while ( *end && *end!='\n' ) end++; + clust->ndbg++; + hts_expand(char*,clust->ndbg,clust->mdbg,clust->dbg); + clust->dbg[clust->ndbg-1] = beg; + if ( !*end ) break; + *end = 0; + beg = end + 1; + } + + *nlines = clust->ndbg; + return clust->dbg; +} + +cluster_t *append_cluster(node_t *node, cluster_t *cluster, int *nclust, node_t **stack) +{ + (*nclust)++; + cluster = (cluster_t*) realloc(cluster,sizeof(cluster_t)*(*nclust)); + cluster_t *clust = &cluster[*nclust-1]; + clust->nmemb = 0; + clust->memb = NULL; + clust->dist = node->value; + + int nstack = 1; + stack[0] = node; + + while ( nstack ) + { + node_t *node = stack[--nstack]; + node_t *akid = node->akid; + node_t *bkid = node->bkid; + if ( node->akid ) + { + stack[nstack++] = akid; + stack[nstack++] = bkid; + } + else + { + clust->nmemb++; + clust->memb = (int*) realloc(clust->memb,sizeof(int)*clust->nmemb); + clust->memb[clust->nmemb-1] = node->id; + } + } + return cluster; +} + +int cmp_nodes(const void *a, const void *b) +{ + const node_t *an = *((const node_t**) a); + const node_t *bn = *((const node_t**) b); + if ( an->value < bn->value ) return -1; + if ( an->value > bn->value ) return 1; + return 0; +} + +float calc_dev(node_t **dat, int n) +{ + float avg = 0, dev = 0; + int i; + for (i=0; ivalue; + avg /= n; + for (i=0; ivalue - avg)*(dat[i]->value - avg); + return sqrt(dev/n); +} + +/* + Heuristics to determine clustering cutoff: sort nodes by distance and + split into two groups by minimizing the standard deviation. + This works best when two elements from a single different sample are + included in the mix. + - min_inter_dist .. smaller values are always considered identical + - max_intra_dist .. larger values are always considered different + */ +float hclust_set_threshold(hclust_t *clust, float min_inter_dist, float max_intra_dist) +{ + node_t **dat = clust->rmme + clust->ndat; + int i, ndat = clust->nrmme - clust->ndat; + + qsort(dat, ndat, sizeof(dat), cmp_nodes); + + clust->str.l = 0; + float th, min_dev = HUGE_VAL; + int imin = -1; + for (i=0; i0 ) dev += calc_dev(dat,i); + if ( i+1value; + ksprintf(&clust->str,"DEV\t%f\t%f\n",th,dev); + if ( min_dev > dev && th >= min_inter_dist ) { min_dev = dev; imin = i; } + } + if ( max_intra_dist > 0 ) + th = max_intra_dist; // use fixed cutoff, the above was only for debugging output + else + { + // dynamic cutoff + max_intra_dist = fabs(max_intra_dist); + th = imin==-1 ? max_intra_dist : dat[imin]->value; + if ( th > max_intra_dist ) th = max_intra_dist; + } + ksprintf(&clust->str,"TH\t%f\n", th); + ksprintf(&clust->str,"MAX_DIST\t%f\n", dat[ndat-1]->value); + ksprintf(&clust->str,"MIN_INTER\t%f\n", min_inter_dist); + ksprintf(&clust->str,"MAX_INTRA\t%f\n", max_intra_dist); + return th; +} + +cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust) +{ + float cutoff = *max_intra_dist = hclust_set_threshold(clust, min_inter_dist, *max_intra_dist); + + node_t **stack = (node_t**) malloc(sizeof(node_t*)*clust->ndat); + node_t **tmp = (node_t**) malloc(sizeof(node_t*)*clust->ndat); + stack[0] = clust->first; + int nstack = 1; + + cluster_t *cluster = NULL; + int ncluster = 0; + + if ( stack[0]->value < cutoff ) + { + // all values are within the limits - create a single cluster + cluster = append_cluster(stack[0], cluster, &ncluster, tmp); + nstack = 0; + } + + while ( nstack ) + { + node_t *node = stack[--nstack]; + node_t *akid = node->akid; + node_t *bkid = node->bkid; + if ( !akid ) + { + cluster = append_cluster(node, cluster, &ncluster, tmp); + continue; + } + + if ( node->value >= cutoff && akid->value < cutoff ) + cluster = append_cluster(akid, cluster, &ncluster, tmp); + else + stack[nstack++] = akid; + + if ( node->value >= cutoff && bkid->value < cutoff ) + cluster = append_cluster(bkid, cluster, &ncluster, tmp); + else + stack[nstack++] = bkid; + } + + free(tmp); + free(stack); + + *nclust = ncluster; + return cluster; +} + +void hclust_destroy_list(cluster_t *clust, int nclust) +{ + int i; + for (i=0; i + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +/* + Simple hierarchical clustering +*/ + +#ifndef __HCLUST_H__ +#define __HCLUST_H__ + +#include + +typedef struct _hclust_t hclust_t; + +typedef struct +{ + float dist; + int nmemb, *memb; +} +cluster_t; + +#define PDIST(mat,a,b) (mat)[((a)>(b)?((a)*((a)-1)/2+(b)):((b)*((b)-1)/2+(a)))] + +/* + * hclust_init() - init and run clustering + * @n: number of elements + * @pdist: pairwise distances. The array will be modified by hclust and + * must exist until hclust_destroy() is called + */ +hclust_t *hclust_init(int n, float *pdist); +void hclust_destroy(hclust_t *clust); + +/* + * hclust_create_list() - returns a list of clusters + * @min_inter_dist: minimum inter-cluster distance. If smaller, elements are considered + * homogenous, belonging to the same cluster. + * @max_intra_dist: maximum intra-cluster distance allowed. If smaller than 0, + * the threshold can be heuristically lowered, otherwise considered + * a fixed cutoff. The pointer will be filled to the cutoff actually used. + */ +cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust); +void hclust_destroy_list(cluster_t *clust, int nclust); + +/* + * Access debugging data used in the decision making process. Note that this + * must be called immediately after hclust_create_list because other calls, + * such as hclust_create_dot(), invalidate the temporary data structures. + */ +char **hclust_explain(hclust_t *clust, int *nlines); + +char *hclust_create_dot(hclust_t *clust, char **labels, float th); + +#endif + diff --git a/bcftools/kheap.h b/bcftools/kheap.h new file mode 100644 index 0000000..ac2f9f9 --- /dev/null +++ b/bcftools/kheap.h @@ -0,0 +1,171 @@ +/* The MIT License + + Copyright (C) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ +/* + Usage example: + + #include "kheap.h" + + // First we prepare the user data to store, in this example it is a + // struct with a single element "key", and a comparator function + // "is_smaller". In this example the comparator defines a min heap (as + // opposed to a max heap). + typedef struct + { + uint32_t key; + } + data_t; + static inline int is_smaller(data_t *a, data_t *b) + { + return a->key < b->key ? 1 : 0; + } + data_t data[3] = { {3}, {2}, {1} }; + + + // Heap declaration, "mh" is an arbitrary string. The typedef is not + // required, it is just a convenience shortcut so that we can use + // "heap_t" instead of the generic "khp_mh_t" automatically created by + // the KHEAP_INIT macro. + KHEAP_INIT(mh, data_t, is_smaller) + typedef khp_mh_t heap_t; + + // Initialize the heap, insert the test data, then retrieve them back, + // sorted. Multiple heaps with the same name "mh" can be created and + // used simultaneously, as long as they all use the same data type + // "data_t". + heap_t *heap = khp_init(mh); + + for (int i=0; i<3; i++) + khp_insert(mh, heap, &data[i]); + + while (heap->ndat) + { + printf("%d\n", heap->dat[0].pos); + khp_delete(mh, heap); + } + + // Clean up + khp_destroy(mh, heap); + +*/ + +#ifndef __KHEAP_H__ +#define __KHEAP_H__ + +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + + +#define __KHEAP_TYPE(name, kheap_t) \ + typedef struct { \ + int ndat, mdat; \ + kheap_t *dat; \ + kheap_t tmp; \ + } khp_##name##_t; + +#define khp_parent(i) (((i)-1)/2) +#define khp_lchild(i) (2*(i)+1) +#define khp_rchild(i) (2*(i)+2) +#define khp_swap(hp,i,j) { \ + ((hp)->tmp) = ((hp)->dat[i]); \ + ((hp)->dat[i]) = ((hp)->dat[j]); \ + ((hp)->dat[j]) = ((hp)->tmp); \ + } + +#define __KHEAP_IMPL(name, SCOPE, kheap_t, __cmp) \ + SCOPE khp_##name##_t *khp_init_##name(void) \ + { \ + return (khp_##name##_t*)calloc(1, sizeof(khp_##name##_t)); \ + } \ + SCOPE void khp_destroy_##name(khp_##name##_t *heap) \ + { \ + if (heap) free(heap->dat); \ + free(heap); \ + } \ + SCOPE int khp_insert_##name(khp_##name##_t *heap, kheap_t *dat) \ + { \ + heap->ndat++; \ + if ( heap->ndat > heap->mdat ) \ + { \ + heap->mdat = heap->ndat; \ + kroundup32(heap->mdat); \ + heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \ + } \ + int i = heap->ndat - 1; \ + while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) ) \ + { \ + heap->dat[i] = heap->dat[khp_parent(i)]; \ + i = khp_parent(i); \ + } \ + heap->dat[i] = *dat; \ + return i; \ + } \ + SCOPE void khp_heapify_##name(khp_##name##_t *heap, int i) \ + { \ +/*todo: loop instead of a recursive function? */ \ + int extreme = khp_lchild(i) < heap->ndat && __cmp(&heap->dat[khp_lchild(i)],&heap->dat[i]) ? khp_lchild(i) : i; \ + if ( khp_rchild(i) < heap->ndat && __cmp(&heap->dat[khp_rchild(i)],&heap->dat[extreme]) ) extreme = khp_rchild(i); \ + if ( extreme != i ) \ + { \ + khp_swap(heap,i,extreme); \ + khp_heapify_##name(heap,extreme); \ + } \ + } \ + SCOPE void khp_delete_##name(khp_##name##_t *heap) \ + { \ + if ( !heap || !heap->ndat ) return; \ + heap->dat[0] = heap->dat[--heap->ndat]; \ + khp_heapify_##name(heap, 0); \ + } \ + +#define KHEAP_INIT(name, kheap_t, __cmp) \ + __KHEAP_TYPE(name, kheap_t) \ + __KHEAP_IMPL(name, static kh_inline klib_unused, kheap_t, __cmp) + +#define khp_init(name) khp_init_##name() +#define khp_destroy(name, heap) khp_destroy_##name(heap) +#define khp_insert(name, heap, dat) khp_insert_##name(heap, dat) +#define khp_delete(name, heap) khp_delete_##name(heap) + +#endif diff --git a/bcftools/main.c b/bcftools/main.c index 1892c1d..9350ff8 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -54,6 +54,8 @@ int main_polysomy(int argc, char *argv[]); #endif int main_plugin(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); +int main_csq(int argc, char *argv[]); +int bam_mpileup(int argc, char *argv[]); typedef struct { @@ -140,6 +142,10 @@ static cmd_t cmds[] = .alias = "cnv", .help = "HMM CNV calling" }, + { .func = main_csq, + .alias = "csq", + .help = "call variation consequences" + }, { .func = main_vcffilter, .alias = "filter", .help = "filter VCF/BCF files using fixed thresholds" @@ -148,6 +154,10 @@ static cmd_t cmds[] = .alias = "gtcheck", .help = "check sample concordance, detect sample swaps and contamination" }, + { .func = bam_mpileup, + .alias = "mpileup", + .help = "multi-way pileup producing genotype likelihoods" + }, #if USE_GPL { .func = main_polysomy, .alias = "polysomy", diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index f578442..a2b4a99 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -56,6 +56,8 @@ int main_polysomy(int argc, char *argv[]); #endif int main_plugin(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); +int main_csq(int argc, char *argv[]); +int bam_mpileup(int argc, char *argv[]); typedef struct { @@ -142,6 +144,10 @@ static cmd_t cmds[] = .alias = "cnv", .help = "HMM CNV calling" }, + { .func = main_csq, + .alias = "csq", + .help = "call variation consequences" + }, { .func = main_vcffilter, .alias = "filter", .help = "filter VCF/BCF files using fixed thresholds" @@ -150,6 +156,10 @@ static cmd_t cmds[] = .alias = "gtcheck", .help = "check sample concordance, detect sample swaps and contamination" }, + { .func = bam_mpileup, + .alias = "mpileup", + .help = "multi-way pileup producing genotype likelihoods" + }, #if USE_GPL { .func = main_polysomy, .alias = "polysomy", diff --git a/bcftools/mcall.c b/bcftools/mcall.c index 495f849..7f7515f 100644 --- a/bcftools/mcall.c +++ b/bcftools/mcall.c @@ -1,6 +1,6 @@ /* mcall.c -- multiallelic and rare variant calling. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -107,6 +107,16 @@ int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl) // static void mcall_init_trios(call_t *call) { + if ( call->prior_AN ) + { + int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN); + if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN); + if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN); + id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC); + if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC); + if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC); + } + // 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250; call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64; @@ -347,8 +357,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse break; } if ( PLs[j]==bcf_int32_missing ) break; - assert( PLs[j]<256 ); - pdg[j] = pl2p[ PLs[j] ]; + pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.); sum += pdg[j]; } @@ -367,8 +376,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse { assert( PLs[j]!=bcf_int32_vector_end ); if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255; - assert( PLs[j]<256 ); - pdg[j] = pl2p[ PLs[j] ]; + pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.); sum += pdg[j]; } } @@ -539,19 +547,19 @@ float calc_HOB(int nref, int nalt, int nhets, int ndiploid) /** * log(sum_i exp(a_i)) */ -static inline double logsumexp(double *vals, int nvals) -{ - int i; - double max_exp = vals[0]; - for (i=1; itheta; // the prior - UPDATE_MAX_LKs(1<0 && lk_tot_set); } // Two alleles @@ -612,14 +620,16 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) int lk_tot_set = 0; double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]); double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]); - double fab = 2*fa*fb; fa *= fa; fb *= fb; + double fa2 = fa*fa; + double fb2 = fb*fb; + double fab = 2*fa*fb; int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; double *pdg = call->pdg; for (isample=0; isampleploidy || call->ploidy[isample]==2 ) - val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab]; + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; else if ( call->ploidy && call->ploidy[isample]==1 ) val = fa*pdg[iaa] + fb*pdg[ibb]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } @@ -627,7 +637,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; - UPDATE_MAX_LKs(1<qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); - double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc; + double fa2 = fa*fa; + double fb2 = fb*fb; + double fc2 = fc*fc; + double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; int isample, icc = (ic+1)*(ic+2)/2-1; int iac = iaa - ia + ic, ibc = ibb - ib + ic; double *pdg = call->pdg; @@ -660,7 +673,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) { double val = 0; if ( !call->ploidy || call->ploidy[isample]==2 ) - val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; else if ( call->ploidy && call->ploidy[isample]==1 ) val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } @@ -669,7 +682,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; // the prior if ( ic!=0 ) lk_tot += call->theta; // the prior - UPDATE_MAX_LKs(1<qsum[ia]*call->qsum[ia]; + double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; #if USE_PRIOR_FOR_GTS if ( ia!=0 ) lk *= prior; #endif @@ -934,7 +947,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n if ( !(out_als & 1<als_map[ia],call->als_map[ia]); - double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia]; + double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; sum_lk += lk; gls[idx] = lk; if ( best_lk < lk ) @@ -1184,82 +1197,80 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) { - int i, ret; + if ( nals==nout_als ) return; + + int i,j, nret, size = sizeof(float); + + void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point + int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs; - // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer, - // so only dealing with these cases at the moment + // INFO fields for (i=0; in_info; i++) { bcf_info_t *info = &rec->d.info[i]; int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key); - if ( vlen!=BCF_VL_R ) continue; - int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key); - if ( type!=BCF_HT_INT ) continue; + if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag - ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp); - if ( ret>0 ) + int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key); + const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key); + nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); + if ( nret<=0 ) continue; + + if ( nout_als==1 ) + bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change + else { - assert( ret==nals ); - if ( out_als==1 ) - bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1); - else + for (j=0; jals_map[j]==-1 ) continue; // to be dropped - call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point - } - bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als); + int k = call->als_map[j]; + if ( k==-1 ) continue; // to be dropped + memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size); } + bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als); } } + // FORMAT fields for (i=0; in_fmt; i++) { bcf_fmt_t *fmt = &rec->d.fmt[i]; int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id); - if ( vlen!=BCF_VL_R ) continue; + if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag + int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id); - if ( type!=BCF_HT_INT ) continue; + const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id); + nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); + if (nret<=0) continue; + int nsmpl = bcf_hdr_nsamples(call->hdr); - ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp); - if ( ret>0 ) - { - int j, nsmpl = bcf_hdr_nsamples(call->hdr); - int ndp = ret / nsmpl; - assert( ndp==nals ); - if ( out_als==1 ) - { - for (j=0; jPLs[j] = call->itmp[j*ndp]; + assert( nret==nals*nsmpl ); - bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl); - } - else + for (j=0; jPLs + j*nout_als; - int32_t *dp_src = call->itmp + j*ndp; - for (k=0; kals_map[k]==-1 ) continue; // to be dropped - dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point - } - } - bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als); + int l = call->als_map[k]; + if ( l==-1 ) continue; // to be dropped + memcpy(ptr_dst+size*l, ptr_src+size*k, size); } } + bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl); } + + call->PLs = (int32_t*) tmp_new; + call->mPLs = ntmp_new; + call->itmp = (int32_t*) tmp_ori; + call->n_itmp = ntmp_ori; } // NB: in this function we temporarily use calls->als_map for a different // purpose to store mapping from new (target) alleles to original alleles. // -static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) +static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) { bcf_sr_regions_t *tgt = call->srs->targets; if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals); @@ -1282,7 +1293,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->als[nals] = tgt->als[i]; j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]); - if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]); + if ( j+1==*unseen ) { fprintf(stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; } if ( j>=0 ) { @@ -1308,7 +1319,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) nals++; } - if ( !has_new && nals==rec->n_allele ) return; + if ( !has_new && nals==rec->n_allele ) return 0; bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals); // create mapping from new PL to old PL @@ -1360,6 +1371,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) bcf_update_info_float(call->hdr, rec, "QS", qsum, nals); if ( *unseen ) *unseen = nals-1; + return 0; } @@ -1374,7 +1386,7 @@ int mcall(call_t *call, bcf1_t *rec) int i, unseen = call->unseen; // Force alleles when calling genotypes given alleles was requested - if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen); + if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; int nsmpl = bcf_hdr_nsamples(call->hdr); int nals = rec->n_allele; @@ -1395,7 +1407,7 @@ int mcall(call_t *call, bcf1_t *rec) #if QS_FROM_PDG estimate_qsum(call, rec); #else - // Get sum of qualities + // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); if ( nqs < nals ) @@ -1406,23 +1418,50 @@ int mcall(call_t *call, bcf1_t *rec) hts_expand(float,nals,call->nqsum,call->qsum); for (i=nqs; iqsum[i] = 0; } - float qsum_tot = 0; - for (i=0; iqsum[i]; - if ( !call->qsum[0] ) + + // If available, take into account reference panel AFs + if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) { - // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, - // an equivalent of a single reference read. - if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) - error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); - if ( call->itmp[0] ) + int an = call->ac[0]; + if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) { - call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; - qsum_tot += call->qsum[0]; + int ac0 = an; // number of alleles in the reference population + for (i=0; iac[i]==bcf_int32_vector_end ) break; + if ( call->ac[i]==bcf_int32_missing ) continue; + ac0 -= call->ac[i]; + call->qsum[i+1] += call->ac[i]*0.5; + } + if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); + call->qsum[0] += ac0*0.5; + for (i=0; iqsum[i] /= nsmpl + 0.5*an; } } + + float qsum_tot = 0; + for (i=0; iqsum[i]; + + // Is this still necessary?? + // + // if (0&& !call->qsum[0] ) + // { + // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, + // // an equivalent of a single reference read. + // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) + // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); + // if ( call->itmp[0] ) + // { + // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; + // qsum_tot += call->qsum[0]; + // } + // } + if ( qsum_tot ) for (i=0; iqsum[i] /= qsum_tot; #endif + bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag + // Find the best combination of alleles int out_als, nout; if ( nals > 8*sizeof(out_als) ) @@ -1497,13 +1536,17 @@ int mcall(call_t *call, bcf1_t *rec) if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1); // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set - rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum)); + rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk)); } else { // Set the quality of a REF site - rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum)); + if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + rec->qual = call->theta ? -4.343*call->theta : 0; + else + rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk)); } + if ( rec->qual>999 ) rec->qual = 999; if ( rec->qual>50 ) rec->qual = rint(rec->qual); @@ -1530,7 +1573,6 @@ int mcall(call_t *call, bcf1_t *rec) } bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag - bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag return nout; } diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c index 29ed799..a315656 100644 --- a/bcftools/mcall.c.pysam.c +++ b/bcftools/mcall.c.pysam.c @@ -2,7 +2,7 @@ /* mcall.c -- multiallelic and rare variant calling. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -109,6 +109,16 @@ int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl) // static void mcall_init_trios(call_t *call) { + if ( call->prior_AN ) + { + int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN); + if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN); + if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN); + id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC); + if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC); + if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC); + } + // 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250; call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64; @@ -349,8 +359,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse break; } if ( PLs[j]==bcf_int32_missing ) break; - assert( PLs[j]<256 ); - pdg[j] = pl2p[ PLs[j] ]; + pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.); sum += pdg[j]; } @@ -369,8 +378,7 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse { assert( PLs[j]!=bcf_int32_vector_end ); if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255; - assert( PLs[j]<256 ); - pdg[j] = pl2p[ PLs[j] ]; + pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.); sum += pdg[j]; } } @@ -541,19 +549,19 @@ float calc_HOB(int nref, int nalt, int nhets, int ndiploid) /** * log(sum_i exp(a_i)) */ -static inline double logsumexp(double *vals, int nvals) -{ - int i; - double max_exp = vals[0]; - for (i=1; itheta; // the prior - UPDATE_MAX_LKs(1<0 && lk_tot_set); } // Two alleles @@ -614,14 +622,16 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) int lk_tot_set = 0; double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]); double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]); - double fab = 2*fa*fb; fa *= fa; fb *= fb; + double fa2 = fa*fa; + double fb2 = fb*fb; + double fab = 2*fa*fb; int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; double *pdg = call->pdg; for (isample=0; isampleploidy || call->ploidy[isample]==2 ) - val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab]; + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; else if ( call->ploidy && call->ploidy[isample]==1 ) val = fa*pdg[iaa] + fb*pdg[ibb]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } @@ -629,7 +639,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; - UPDATE_MAX_LKs(1<qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); - double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc; + double fa2 = fa*fa; + double fb2 = fb*fb; + double fc2 = fc*fc; + double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; int isample, icc = (ic+1)*(ic+2)/2-1; int iac = iaa - ia + ic, ibc = ibb - ib + ic; double *pdg = call->pdg; @@ -662,7 +675,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) { double val = 0; if ( !call->ploidy || call->ploidy[isample]==2 ) - val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; else if ( call->ploidy && call->ploidy[isample]==1 ) val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } @@ -671,7 +684,7 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; // the prior if ( ic!=0 ) lk_tot += call->theta; // the prior - UPDATE_MAX_LKs(1<qsum[ia]*call->qsum[ia]; + double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; #if USE_PRIOR_FOR_GTS if ( ia!=0 ) lk *= prior; #endif @@ -936,7 +949,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n if ( !(out_als & 1<als_map[ia],call->als_map[ia]); - double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia]; + double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; sum_lk += lk; gls[idx] = lk; if ( best_lk < lk ) @@ -1186,82 +1199,80 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) { - int i, ret; + if ( nals==nout_als ) return; + + int i,j, nret, size = sizeof(float); + + void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point + int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs; - // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer, - // so only dealing with these cases at the moment + // INFO fields for (i=0; in_info; i++) { bcf_info_t *info = &rec->d.info[i]; int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key); - if ( vlen!=BCF_VL_R ) continue; - int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key); - if ( type!=BCF_HT_INT ) continue; + if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag - ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp); - if ( ret>0 ) + int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key); + const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key); + nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); + if ( nret<=0 ) continue; + + if ( nout_als==1 ) + bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change + else { - assert( ret==nals ); - if ( out_als==1 ) - bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1); - else + for (j=0; jals_map[j]==-1 ) continue; // to be dropped - call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point - } - bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als); + int k = call->als_map[j]; + if ( k==-1 ) continue; // to be dropped + memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size); } + bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als); } } + // FORMAT fields for (i=0; in_fmt; i++) { bcf_fmt_t *fmt = &rec->d.fmt[i]; int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id); - if ( vlen!=BCF_VL_R ) continue; + if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag + int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id); - if ( type!=BCF_HT_INT ) continue; + const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id); + nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); + if (nret<=0) continue; + int nsmpl = bcf_hdr_nsamples(call->hdr); - ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp); - if ( ret>0 ) - { - int j, nsmpl = bcf_hdr_nsamples(call->hdr); - int ndp = ret / nsmpl; - assert( ndp==nals ); - if ( out_als==1 ) - { - for (j=0; jPLs[j] = call->itmp[j*ndp]; + assert( nret==nals*nsmpl ); - bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl); - } - else + for (j=0; jPLs + j*nout_als; - int32_t *dp_src = call->itmp + j*ndp; - for (k=0; kals_map[k]==-1 ) continue; // to be dropped - dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point - } - } - bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als); + int l = call->als_map[k]; + if ( l==-1 ) continue; // to be dropped + memcpy(ptr_dst+size*l, ptr_src+size*k, size); } } + bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl); } + + call->PLs = (int32_t*) tmp_new; + call->mPLs = ntmp_new; + call->itmp = (int32_t*) tmp_ori; + call->n_itmp = ntmp_ori; } // NB: in this function we temporarily use calls->als_map for a different // purpose to store mapping from new (target) alleles to original alleles. // -static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) +static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) { bcf_sr_regions_t *tgt = call->srs->targets; if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals); @@ -1284,7 +1295,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->als[nals] = tgt->als[i]; j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]); - if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]); + if ( j+1==*unseen ) { fprintf(pysam_stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; } if ( j>=0 ) { @@ -1310,7 +1321,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) nals++; } - if ( !has_new && nals==rec->n_allele ) return; + if ( !has_new && nals==rec->n_allele ) return 0; bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals); // create mapping from new PL to old PL @@ -1362,6 +1373,7 @@ static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) bcf_update_info_float(call->hdr, rec, "QS", qsum, nals); if ( *unseen ) *unseen = nals-1; + return 0; } @@ -1376,7 +1388,7 @@ int mcall(call_t *call, bcf1_t *rec) int i, unseen = call->unseen; // Force alleles when calling genotypes given alleles was requested - if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen); + if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; int nsmpl = bcf_hdr_nsamples(call->hdr); int nals = rec->n_allele; @@ -1397,7 +1409,7 @@ int mcall(call_t *call, bcf1_t *rec) #if QS_FROM_PDG estimate_qsum(call, rec); #else - // Get sum of qualities + // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); if ( nqs < nals ) @@ -1408,23 +1420,50 @@ int mcall(call_t *call, bcf1_t *rec) hts_expand(float,nals,call->nqsum,call->qsum); for (i=nqs; iqsum[i] = 0; } - float qsum_tot = 0; - for (i=0; iqsum[i]; - if ( !call->qsum[0] ) + + // If available, take into account reference panel AFs + if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) { - // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, - // an equivalent of a single reference read. - if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) - error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); - if ( call->itmp[0] ) + int an = call->ac[0]; + if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) { - call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; - qsum_tot += call->qsum[0]; + int ac0 = an; // number of alleles in the reference population + for (i=0; iac[i]==bcf_int32_vector_end ) break; + if ( call->ac[i]==bcf_int32_missing ) continue; + ac0 -= call->ac[i]; + call->qsum[i+1] += call->ac[i]*0.5; + } + if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); + call->qsum[0] += ac0*0.5; + for (i=0; iqsum[i] /= nsmpl + 0.5*an; } } + + float qsum_tot = 0; + for (i=0; iqsum[i]; + + // Is this still necessary?? + // + // if (0&& !call->qsum[0] ) + // { + // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, + // // an equivalent of a single reference read. + // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) + // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); + // if ( call->itmp[0] ) + // { + // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; + // qsum_tot += call->qsum[0]; + // } + // } + if ( qsum_tot ) for (i=0; iqsum[i] /= qsum_tot; #endif + bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag + // Find the best combination of alleles int out_als, nout; if ( nals > 8*sizeof(out_als) ) @@ -1499,13 +1538,17 @@ int mcall(call_t *call, bcf1_t *rec) if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1); // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set - rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum)); + rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk)); } else { // Set the quality of a REF site - rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum)); + if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + rec->qual = call->theta ? -4.343*call->theta : 0; + else + rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk)); } + if ( rec->qual>999 ) rec->qual = 999; if ( rec->qual>50 ) rec->qual = rint(rec->qual); @@ -1532,7 +1575,6 @@ int mcall(call_t *call, bcf1_t *rec) } bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag - bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag return nout; } diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c new file mode 100644 index 0000000..ac37dd4 --- /dev/null +++ b/bcftools/mpileup.c @@ -0,0 +1,1110 @@ +/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools + + Copyright (C) 2008-2017 Genome Research Ltd. + Portions copyright (C) 2009-2012 Broad Institute. + + Author: Heng Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "regidx.h" +#include "bcftools.h" +#include "bam2bcf.h" +#include "bam_sample.h" +#include "gvcf.h" + +#define MPLP_BCF 1 +#define MPLP_VCF (1<<1) +#define MPLP_NO_COMP (1<<2) +#define MPLP_NO_ORPHAN (1<<3) +#define MPLP_REALN (1<<4) +#define MPLP_NO_INDEL (1<<5) +#define MPLP_REDO_BAQ (1<<6) +#define MPLP_ILLUMINA13 (1<<7) +#define MPLP_IGNORE_RG (1<<8) +#define MPLP_PRINT_POS (1<<9) +#define MPLP_PRINT_MAPQ (1<<10) +#define MPLP_PER_SAMPLE (1<<11) +#define MPLP_SMART_OVERLAPS (1<<12) + +typedef struct _mplp_aux_t mplp_aux_t; +typedef struct _mplp_pileup_t mplp_pileup_t; + +// Data shared by all bam files +typedef struct { + int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int rflag_require, rflag_filter, output_type; + int openQ, extQ, tandemQ, min_support; // for indels + double min_frac; // for indels + char *reg_fname, *pl_list, *fai_fname, *output_fname; + int reg_is_file, record_cmd_line, n_threads; + faidx_t *fai; + regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions + regitr_t *bed_itr, *reg_itr; + int bed_logic; // 1: include region, 0: exclude region + gvcf_t *gvcf; + + // auxiliary structures for calling + bcf_callaux_t *bca; + bcf_callret1_t *bcr; + bcf_call_t bc; + bam_mplp_t iter; + mplp_aux_t **mplp_data; + int nfiles; + char **files; + mplp_pileup_t *gplp; + int *n_plp; + const bam_pileup1_t **plp; + bam_smpl_t *bsmpl; + kstring_t buf; + bcf1_t *bcf_rec; + htsFile *bcf_fp; + bcf_hdr_t *bcf_hdr; + int argc; + char **argv; +} mplp_conf_t; + +typedef struct { + char *ref[2]; + int ref_id[2]; + int ref_len[2]; +} mplp_ref_t; + +#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}} + +// Data specific to each bam file +struct _mplp_aux_t { + samFile *fp; + hts_itr_t *iter; + bam_hdr_t *h; + mplp_ref_t *ref; + const mplp_conf_t *conf; + int bam_id; + hts_idx_t *idx; // maintained only with more than one -r regions +}; + +// Data passed to htslib/mpileup +struct _mplp_pileup_t { + int n; + int *n_plp, *m_plp; + bam_pileup1_t **plp; +}; + +static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { + mplp_ref_t *r = ma->ref; + + //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]); + + if (!r || !ma->conf->fai) { + *ref = NULL; + return 0; + } + + // Do we need to reference count this so multiple mplp_aux_t can + // track which references are in use? + // For now we just cache the last two. Sufficient? + if (tid == r->ref_id[0]) { + *ref = r->ref[0]; + *ref_len = r->ref_len[0]; + return 1; + } + if (tid == r->ref_id[1]) { + // Last, swap over + int tmp; + tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp; + tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp; + + char *tc; + tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc; + *ref = r->ref[0]; + *ref_len = r->ref_len[0]; + return 1; + } + + // New, so migrate to old and load new + free(r->ref[1]); + r->ref[1] = r->ref[0]; + r->ref_id[1] = r->ref_id[0]; + r->ref_len[1] = r->ref_len[0]; + + r->ref_id[0] = tid; + r->ref[0] = faidx_fetch_seq(ma->conf->fai, + ma->h->target_name[r->ref_id[0]], + 0, + INT_MAX, + &r->ref_len[0]); + + if (!r->ref[0]) { + r->ref[0] = NULL; + r->ref_id[0] = -1; + r->ref_len[0] = 0; + *ref = NULL; + return 0; + } + + *ref = r->ref[0]; + *ref_len = r->ref_len[0]; + return 1; +} + +static int mplp_func(void *data, bam1_t *b) +{ + char *ref; + mplp_aux_t *ma = (mplp_aux_t*)data; + int ret, ref_len; + while (1) + { + int has_ref; + ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); + if (ret < 0) break; + // The 'B' cigar operation is not part of the specification, considering as obsolete. + // bam_remove_B(b); + if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads + if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue; + if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue; + if (ma->conf->bed) + { + // test overlap + regitr_t *itr = ma->conf->bed_itr; + int beg = b->core.pos, end = bam_endpos(b)-1; + int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr); + if ( !ma->conf->bed_logic && !overlap ) + { + // exclude only reads which are fully contained in the region + while ( regitr_overlap(itr) ) + { + if ( beg < itr->beg ) { overlap = 1; break; } + if ( end > itr->end ) { overlap = 1; break; } + } + } + if ( !overlap ) continue; + } + if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue; + if (ma->conf->flag & MPLP_ILLUMINA13) { + int i; + uint8_t *qual = bam_get_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + qual[i] = qual[i] > 31? qual[i] - 31 : 0; + } + + if (ma->conf->fai && b->core.tid >= 0) { + has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); + if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence + fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", + __func__, b->core.pos, ref_len, b->core.tid); + continue; + } + } else { + has_ref = 0; + } + + if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + if (has_ref && ma->conf->capQ_thres > 10) { + int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); + if (q < 0) continue; // skip + else if (b->core.qual > q) b->core.qual = q; + } + if (b->core.qual < ma->conf->min_mq) continue; + else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue; + + return ret; + }; + return ret; +} + +// Called once per new bam added to the pileup. +// We cache sample information here so we don't have to keep recomputing this +// on each and every pileup column. +// +// Cd is an arbitrary block of data we can write into, which ends up in +// the pileup structures. We stash the sample ID there. +static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + mplp_aux_t *ma = (mplp_aux_t *)data; + cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); + return 0; +} + +static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp) +{ + int i, j; + memset(m->n_plp, 0, m->n * sizeof(int)); + for (i = 0; i < n; ++i) // iterate over all bams + { + for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position + { + const bam_pileup1_t *p = plp[i] + j; + int id = p->cd.i; + if (m->n_plp[id] == m->m_plp[id]) + { + m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; + m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); + } + m->plp[id][m->n_plp[id]++] = *p; + } + } +} + +static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec) +{ + if ( !conf->gvcf ) + { + if ( rec ) bcf_write1(fp, hdr, rec); + return; + } + + if ( !rec ) + { + gvcf_write(conf->gvcf, fp, hdr, NULL, 0); + return; + } + + int is_ref = 0; + if ( rec->n_allele==1 ) is_ref = 1; + else if ( rec->n_allele==2 ) + { + // second allele is mpileup's X, not a variant + if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1; + } + rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref); + if ( rec ) bcf_write1(fp,hdr,rec); +} + +static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) +{ + bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list + + int ret, i, tid, pos, ref_len; + char *ref; + + while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) + { + if ( end && (posend) ) continue; + if ( conf->bed && tid >= 0 ) + { + int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL); + if ( !conf->bed_logic ) overlap = overlap ? 0 : 1; + if ( !overlap ) continue; + } + mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + + int total_depth, _ref0, ref16; + for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i]; + group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp); + _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; + ref16 = seq_nt16_table[_ref0]; + bcf_callaux_clean(conf->bca, &conf->bc); + for (i = 0; i < conf->gplp->n; ++i) + bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i); + conf->bc.tid = tid; conf->bc.pos = pos; + bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc); + bcf_clear1(conf->bcf_rec); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0); + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + + // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? + // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth + && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0) + { + bcf_callaux_clean(conf->bca, &conf->bc); + for (i = 0; i < conf->gplp->n; ++i) + bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); + if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + { + bcf_clear1(conf->bcf_rec); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + } + } + } + return 0; +} + +static int mpileup(mplp_conf_t *conf) +{ + if (conf->nfiles == 0) { + fprintf(stderr,"[%s] no input file/data given\n", __func__); + exit(EXIT_FAILURE); + } + + mplp_ref_t mp_ref = MPLP_REF_INIT; + conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t)); + conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*)); + conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*)); + conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int)); + + // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index + // must be kept in the memory for the whole time which can be a problem with many bams. + // Therefore if none or only one region is requested, we initialize the bam iterator as + // before and free the index. Only when multiple regions are queried, we keep the index. + int nregs = 0; + if ( conf->reg_fname ) + { + if ( conf->reg_is_file ) + { + conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL); + if ( !conf->reg ) { + fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); + exit(EXIT_FAILURE); + } + } + else + { + conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); + if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) { + fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); + exit(EXIT_FAILURE); + } + } + nregs = regidx_nregs(conf->reg); + conf->reg_itr = regitr_init(conf->reg); + regitr_loop(conf->reg_itr); // region iterator now positioned at the first region + } + + // read the header of each file in the list and initialize data + // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least! + bam_hdr_t *hdr = NULL; // header of first file in input list + int i; + for (i = 0; i < conf->nfiles; ++i) { + bam_hdr_t *h_tmp; + conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t)); + conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb"); + if ( !conf->mplp_data[i]->fp ) + { + fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); + exit(EXIT_FAILURE); + } + if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + exit(EXIT_FAILURE); + } + if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) { + fprintf(stderr, "[%s] failed to process %s: %s\n", + __func__, conf->fai_fname, strerror(errno)); + exit(EXIT_FAILURE); + } + conf->mplp_data[i]->conf = conf; + conf->mplp_data[i]->ref = &mp_ref; + h_tmp = sam_hdr_read(conf->mplp_data[i]->fp); + if ( !h_tmp ) { + fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); + exit(EXIT_FAILURE); + } + conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet + conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]); + if ( conf->mplp_data[i]->bam_id<0 ) + { + // no usable readgroups in this bam, it can be skipped + sam_close(conf->mplp_data[i]->fp); + free(conf->mplp_data[i]); + bam_hdr_destroy(h_tmp); + free(conf->files[i]); + if ( i+1nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1)); + conf->nfiles--; + i--; + continue; + } + if (conf->reg) { + hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); + if (idx == NULL) { + fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); + exit(EXIT_FAILURE); + } + conf->buf.l = 0; + ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); + conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s); + if ( !conf->mplp_data[i]->iter ) + { + conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); + if ( conf->mplp_data[i]->iter ) { + fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); + exit(EXIT_FAILURE); + } + fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); + exit(EXIT_FAILURE); + } + if ( nregs==1 ) // no need to keep the index in memory + hts_idx_destroy(idx); + else + conf->mplp_data[i]->idx = idx; + } + + if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */ + else { + // FIXME: check consistency between h and h_tmp + bam_hdr_destroy(h_tmp); + + // we store only the first file's header; it's (alleged to be) + // compatible with the i-th file's target_name lookup needs + conf->mplp_data[i]->h = hdr; + } + } + // allocate data storage proportionate to number of samples being studied sm->n + bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n); + conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int)); + conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); + conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); + + fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); + // write the VCF header + conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); + if (conf->bcf_fp == NULL) { + fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); + exit(EXIT_FAILURE); + } + if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads); + + // BCF header creation + conf->bcf_hdr = bcf_hdr_init("w"); + conf->buf.l = 0; + + if (conf->record_cmd_line) + { + ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version()); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + + conf->buf.l = 0; + ksprintf(&conf->buf, "##bcftoolsCommand=mpileup"); + for (i=1; iargc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]); + kputc('\n', &conf->buf); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + } + + if (conf->fai_fname) + { + conf->buf.l = 0; + ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + } + + // Translate BAM @SQ tags to BCF ##contig tags + // todo: use/write new BAM header manipulation routines, fill also UR, M5 + for (i=0; in_targets; i++) + { + conf->buf.l = 0; + ksprintf(&conf->buf, "##contig=", hdr->target_name[i], hdr->target_len[i]); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + } + conf->buf.l = 0; + + bcf_hdr_append(conf->bcf_hdr,"##ALT="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); +#if CDF_MWU_TESTS + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); +#endif + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_DP ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_DV ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_DPR ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_INFO_DPR ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_FMT_DP4 ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_SP ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_AD ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_ADF ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_ADR ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_INFO_AD ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_ADF ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_ADR ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->gvcf ) + gvcf_update_header(conf->gvcf, conf->bcf_hdr); + + int nsmpl; + const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); + for (i=0; ibcf_hdr, smpl[i]); + bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); + + conf->bca = bcf_call_init(-1., conf->min_baseQ); + conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); + conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; + conf->bca->min_frac = conf->min_frac; + conf->bca->min_support = conf->min_support; + conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; + + conf->bc.bcf_hdr = conf->bcf_hdr; + conf->bc.n = nsmpl; + conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); + if (conf->fmt_flag) + { + assert( sizeof(float)==sizeof(int32_t) ); + conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4); + conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32 + if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) + { + // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample + conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); + conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); + for (i=0; ibcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES; + conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; + } + } + } + + // init mpileup + conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); + if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); + if ( (double)conf->max_depth * conf->nfiles > 1<<20) + fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); + if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) + fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl); + bam_mplp_set_maxcnt(conf->iter, conf->max_depth); + conf->max_indel_depth = conf->max_indel_depth * nsmpl; + conf->bcf_rec = bcf_init1(); + bam_mplp_constructor(conf->iter, pileup_constructor); + + // Run mpileup for multiple regions + if ( nregs ) + { + int ireg = 0; + do + { + // first region is already positioned + if ( ireg++ > 0 ) + { + conf->buf.l = 0; + ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); + + for (i=0; infiles; i++) + { + hts_itr_destroy(conf->mplp_data[i]->iter); + conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); + if ( !conf->mplp_data[i]->iter ) + { + conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); + if ( conf->mplp_data[i]->iter ) { + fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); + exit(EXIT_FAILURE); + } + fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); + exit(EXIT_FAILURE); + } + bam_mplp_reset(conf->iter); + } + } + mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end); + } + while ( regitr_loop(conf->reg_itr) ); + } + else + mpileup_reg(conf,0,0); + + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); + + // clean up + free(conf->bc.tmp.s); + bcf_destroy1(conf->bcf_rec); + if (conf->bcf_fp) + { + hts_close(conf->bcf_fp); + bcf_hdr_destroy(conf->bcf_hdr); + bcf_call_destroy(conf->bca); + free(conf->bc.PL); + free(conf->bc.DP4); + free(conf->bc.ADR); + free(conf->bc.ADF); + free(conf->bc.fmt_arr); + free(conf->bcr); + } + if ( conf->gvcf ) gvcf_destroy(conf->gvcf); + free(conf->buf.s); + for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]); + free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp); + bam_mplp_destroy(conf->iter); + bam_hdr_destroy(hdr); + for (i = 0; i < conf->nfiles; ++i) { + if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx); + sam_close(conf->mplp_data[i]->fp); + if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter); + free(conf->mplp_data[i]); + } + if ( conf->reg_itr ) regitr_destroy(conf->reg_itr); + free(conf->mplp_data); free(conf->plp); free(conf->n_plp); + free(mp_ref.ref[0]); + free(mp_ref.ref[1]); + return 0; +} + +static int is_url(const char *s) +{ + static const char uri_scheme_chars[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"; + return s[strspn(s, uri_scheme_chars)] == ':'; +} + +#define MAX_PATH_LEN 1024 +int read_file_list(const char *file_list,int *n,char **argv[]) +{ + char buf[MAX_PATH_LEN]; + int len, nfiles = 0; + char **files = NULL; + struct stat sb; + + *n = 0; + *argv = NULL; + + FILE *fh = fopen(file_list,"r"); + if ( !fh ) + { + fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); + return 1; + } + + files = (char**) calloc(nfiles,sizeof(char*)); + nfiles = 0; + while ( fgets(buf,MAX_PATH_LEN,fh) ) + { + // allow empty lines and trailing spaces + len = strlen(buf); + while ( len>0 && isspace(buf[len-1]) ) len--; + if ( !len ) continue; + + // check sanity of the file list + buf[len] = 0; + if (! (is_url(buf) || stat(buf, &sb) == 0)) + { + // no such file, check if it is safe to print its name + int i, safe_to_print = 1; + for (i=0; irflag_require); + char *tmp_filter = bam_flag2str(mplp->rflag_filter); + + // Display usage information, formatted for the standard 80 columns. + // (The unusual string formatting here aids the readability of this + // source code in 80 columns, to the extent that's possible.) + + fprintf(fp, +"\n" +"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" +"\n" +"Input options:\n" +" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" +" -A, --count-orphans do not discard anomalous read pairs\n" +" -b, --bam-list FILE list of input BAM filenames, one per line\n" +" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" +" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" +" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + fprintf(fp, +" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" +" -f, --fasta-ref FILE faidx indexed reference sequence file\n" +" --no-reference do not require fasta reference file\n" +" -G, --read-groups FILE select or exclude read groups listed in the file\n" +" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + fprintf(fp, +" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + fprintf(fp, +" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" +" -R, --regions-file FILE restrict to regions listed in a file\n" +" --ignore-RG ignore RG tags (one BAM = one sample)\n" +" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); + fprintf(fp, +" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" +" [%s]\n", tmp_filter); + fprintf(fp, +" -s, --samples LIST comma separated list of samples to include\n" +" -S, --samples-file FILE file of samples to include\n" +" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" +" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +" -x, --ignore-overlaps disable read-pair overlap detection\n" +"\n" +"Output options:\n" +" -a, --annotate LIST optional tags to output; '?' to list []\n" +" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n" +" to minimum per-sample DP\n" +" --no-version do not append version and command line to the header\n" +" -o, --output FILE write output to FILE [standard output]\n" +" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" +" 'z' compressed VCF; 'v' uncompressed VCF [v]\n" +" --threads INT number of extra output compression threads [0]\n" +"\n" +"SNP/INDEL genotype likelihoods options:\n" +" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); + fprintf(fp, +" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); + fprintf(fp, +" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + fprintf(fp, +" -I, --skip-indels do not perform indel calling\n" +" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + fprintf(fp, +" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + fprintf(fp, +" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + fprintf(fp, +" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" +" -P, --platforms STR comma separated list of platforms for indels [all]\n" +"\n" +"Notes: Assuming diploid individuals.\n" +"\n"); + + free(tmp_require); + free(tmp_filter); +} + +int bam_mpileup(int argc, char *argv[]) +{ + int c; + const char *file_list = NULL; + char **fn = NULL; + int nfiles = 0, use_orphan = 0, noref = 0; + mplp_conf_t mplp; + memset(&mplp, 0, sizeof(mplp_conf_t)); + mplp.min_baseQ = 13; + mplp.capQ_thres = 0; + mplp.max_depth = 250; mplp.max_indel_depth = 250; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; + mplp.min_frac = 0.002; mplp.min_support = 1; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; + mplp.argc = argc; mplp.argv = argv; + mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + mplp.output_fname = NULL; + mplp.output_type = FT_VCF; + mplp.record_cmd_line = 1; + mplp.n_threads = 0; + mplp.bsmpl = bam_smpl_init(); + + static const struct option lopts[] = + { + {"rf", required_argument, NULL, 1}, // require flag + {"ff", required_argument, NULL, 2}, // filter flag + {"incl-flags", required_argument, NULL, 1}, + {"excl-flags", required_argument, NULL, 2}, + {"output", required_argument, NULL, 3}, + {"open-prob", required_argument, NULL, 4}, + {"ignore-RG", no_argument, NULL, 5}, + {"ignore-rg", no_argument, NULL, 5}, + {"gvcf", required_argument, NULL, 'g'}, + {"non-reference", no_argument, NULL, 7}, + {"no-version", no_argument, NULL, 8}, + {"threads",required_argument,NULL,9}, + {"illumina1.3+", no_argument, NULL, '6'}, + {"count-orphans", no_argument, NULL, 'A'}, + {"bam-list", required_argument, NULL, 'b'}, + {"no-BAQ", no_argument, NULL, 'B'}, + {"no-baq", no_argument, NULL, 'B'}, + {"adjust-MQ", required_argument, NULL, 'C'}, + {"adjust-mq", required_argument, NULL, 'C'}, + {"max-depth", required_argument, NULL, 'd'}, + {"redo-BAQ", no_argument, NULL, 'E'}, + {"redo-baq", no_argument, NULL, 'E'}, + {"fasta-ref", required_argument, NULL, 'f'}, + {"read-groups", required_argument, NULL, 'G'}, + {"region", required_argument, NULL, 'r'}, + {"regions", required_argument, NULL, 'r'}, + {"regions-file", required_argument, NULL, 'R'}, + {"targets", required_argument, NULL, 't'}, + {"targets-file", required_argument, NULL, 'T'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-mq", required_argument, NULL, 'q'}, + {"min-BQ", required_argument, NULL, 'Q'}, + {"min-bq", required_argument, NULL, 'Q'}, + {"ignore-overlaps", no_argument, NULL, 'x'}, + {"output-type", required_argument, NULL, 'O'}, + {"samples", required_argument, NULL, 's'}, + {"samples-file", required_argument, NULL, 'S'}, + {"annotate", required_argument, NULL, 'a'}, + {"ext-prob", required_argument, NULL, 'e'}, + {"gap-frac", required_argument, NULL, 'F'}, + {"tandem-qual", required_argument, NULL, 'h'}, + {"skip-indels", no_argument, NULL, 'I'}, + {"max-idepth", required_argument, NULL, 'L'}, + {"min-ireads ", required_argument, NULL, 'm'}, + {"per-sample-mF", no_argument, NULL, 'p'}, + {"per-sample-mf", no_argument, NULL, 'p'}, + {"platforms", required_argument, NULL, 'P'}, + {NULL, 0, NULL, 0} + }; + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) { + switch (c) { + case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; + case 1 : + mplp.rflag_require = bam_str2flag(optarg); + if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } + break; + case 2 : + mplp.rflag_filter = bam_str2flag(optarg); + if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } + break; + case 3 : mplp.output_fname = optarg; break; + case 4 : mplp.openQ = atoi(optarg); break; + case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break; + case 'g': + mplp.gvcf = gvcf_init(optarg); + if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg); + break; + case 'f': + mplp.fai = fai_load(optarg); + if (mplp.fai == NULL) return 1; + mplp.fai_fname = optarg; + break; + case 7 : noref = 1; break; + case 8 : mplp.record_cmd_line = 0; break; + case 9 : mplp.n_threads = strtol(optarg, 0, 0); break; + case 'd': mplp.max_depth = atoi(optarg); break; + case 'r': mplp.reg_fname = strdup(optarg); break; + case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break; + case 't': + // In the original version the whole BAM was streamed which is inefficient + // with few BED intervals and big BAMs. Todo: devise a heuristic to determine + // best strategy, that is streaming or jumping. + if ( optarg[0]=='^' ) optarg++; + else mplp.bed_logic = 1; + mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); + mplp.bed_itr = regitr_init(mplp.bed); + if ( regidx_insert_list(mplp.bed,optarg,',') !=0 ) + { + fprintf(stderr,"Could not parse the targets: %s\n", optarg); + exit(EXIT_FAILURE); + } + break; + case 'T': + if ( optarg[0]=='^' ) optarg++; + else mplp.bed_logic = 1; + mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL); + if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; } + break; + case 'P': mplp.pl_list = strdup(optarg); break; + case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; + case 'B': mplp.flag &= ~MPLP_REALN; break; + case 'I': mplp.flag |= MPLP_NO_INDEL; break; + case 'E': mplp.flag |= MPLP_REDO_BAQ; break; + case '6': mplp.flag |= MPLP_ILLUMINA13; break; + case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break; + case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break; + case 'O': + switch (optarg[0]) { + case 'b': mplp.output_type = FT_BCF_GZ; break; + case 'u': mplp.output_type = FT_BCF; break; + case 'z': mplp.output_type = FT_VCF_GZ; break; + case 'v': mplp.output_type = FT_VCF; break; + default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n", optarg); + } + break; + case 'C': mplp.capQ_thres = atoi(optarg); break; + case 'q': mplp.min_mq = atoi(optarg); break; + case 'Q': mplp.min_baseQ = atoi(optarg); break; + case 'b': file_list = optarg; break; + case 'o': { + char *end; + long value = strtol(optarg, &end, 10); + // Distinguish between -o INT and -o FILE (a bit of a hack!) + if (*end == '\0') mplp.openQ = value; + else mplp.output_fname = optarg; + } + break; + case 'e': mplp.extQ = atoi(optarg); break; + case 'h': mplp.tandemQ = atoi(optarg); break; + case 'A': use_orphan = 1; break; + case 'F': mplp.min_frac = atof(optarg); break; + case 'm': mplp.min_support = atoi(optarg); break; + case 'L': mplp.max_indel_depth = atoi(optarg); break; + case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break; + case 'a': + if (optarg[0]=='?') { + list_annotations(stderr); + return 1; + } + mplp.fmt_flag |= parse_format_flag(optarg); + break; + default: + fprintf(stderr,"Invalid option: '%c'\n", c); + return 1; + } + } + + if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) + { + fprintf(stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n"); + mplp.fmt_flag |= B2B_FMT_DP; + } + if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) + { + if ( mplp.flag&MPLP_VCF ) + { + if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; + else mplp.output_type = FT_VCF_GZ; + } + else if ( mplp.flag&MPLP_BCF ) + { + if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; + else mplp.output_type = FT_BCF_GZ; + } + } + if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) + { + fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); + return 1; + } + if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; + if (argc == 1) + { + print_usage(stderr, &mplp); + return 1; + } + if (!mplp.fai && !noref) { + fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n"); + return 1; + } + int ret,i; + if (file_list) + { + if ( read_file_list(file_list,&nfiles,&fn) ) return 1; + mplp.files = fn; + mplp.nfiles = nfiles; + } + else + { + mplp.nfiles = argc - optind; + mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*)); + for (i=0; i + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "regidx.h" +#include "bcftools.h" +#include "bam2bcf.h" +#include "bam_sample.h" +#include "gvcf.h" + +#define MPLP_BCF 1 +#define MPLP_VCF (1<<1) +#define MPLP_NO_COMP (1<<2) +#define MPLP_NO_ORPHAN (1<<3) +#define MPLP_REALN (1<<4) +#define MPLP_NO_INDEL (1<<5) +#define MPLP_REDO_BAQ (1<<6) +#define MPLP_ILLUMINA13 (1<<7) +#define MPLP_IGNORE_RG (1<<8) +#define MPLP_PRINT_POS (1<<9) +#define MPLP_PRINT_MAPQ (1<<10) +#define MPLP_PER_SAMPLE (1<<11) +#define MPLP_SMART_OVERLAPS (1<<12) + +typedef struct _mplp_aux_t mplp_aux_t; +typedef struct _mplp_pileup_t mplp_pileup_t; + +// Data shared by all bam files +typedef struct { + int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int rflag_require, rflag_filter, output_type; + int openQ, extQ, tandemQ, min_support; // for indels + double min_frac; // for indels + char *reg_fname, *pl_list, *fai_fname, *output_fname; + int reg_is_file, record_cmd_line, n_threads; + faidx_t *fai; + regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions + regitr_t *bed_itr, *reg_itr; + int bed_logic; // 1: include region, 0: exclude region + gvcf_t *gvcf; + + // auxiliary structures for calling + bcf_callaux_t *bca; + bcf_callret1_t *bcr; + bcf_call_t bc; + bam_mplp_t iter; + mplp_aux_t **mplp_data; + int nfiles; + char **files; + mplp_pileup_t *gplp; + int *n_plp; + const bam_pileup1_t **plp; + bam_smpl_t *bsmpl; + kstring_t buf; + bcf1_t *bcf_rec; + htsFile *bcf_fp; + bcf_hdr_t *bcf_hdr; + int argc; + char **argv; +} mplp_conf_t; + +typedef struct { + char *ref[2]; + int ref_id[2]; + int ref_len[2]; +} mplp_ref_t; + +#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}} + +// Data specific to each bam file +struct _mplp_aux_t { + samFile *fp; + hts_itr_t *iter; + bam_hdr_t *h; + mplp_ref_t *ref; + const mplp_conf_t *conf; + int bam_id; + hts_idx_t *idx; // maintained only with more than one -r regions +}; + +// Data passed to htslib/mpileup +struct _mplp_pileup_t { + int n; + int *n_plp, *m_plp; + bam_pileup1_t **plp; +}; + +static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { + mplp_ref_t *r = ma->ref; + + //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]); + + if (!r || !ma->conf->fai) { + *ref = NULL; + return 0; + } + + // Do we need to reference count this so multiple mplp_aux_t can + // track which references are in use? + // For now we just cache the last two. Sufficient? + if (tid == r->ref_id[0]) { + *ref = r->ref[0]; + *ref_len = r->ref_len[0]; + return 1; + } + if (tid == r->ref_id[1]) { + // Last, swap over + int tmp; + tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp; + tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp; + + char *tc; + tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc; + *ref = r->ref[0]; + *ref_len = r->ref_len[0]; + return 1; + } + + // New, so migrate to old and load new + free(r->ref[1]); + r->ref[1] = r->ref[0]; + r->ref_id[1] = r->ref_id[0]; + r->ref_len[1] = r->ref_len[0]; + + r->ref_id[0] = tid; + r->ref[0] = faidx_fetch_seq(ma->conf->fai, + ma->h->target_name[r->ref_id[0]], + 0, + INT_MAX, + &r->ref_len[0]); + + if (!r->ref[0]) { + r->ref[0] = NULL; + r->ref_id[0] = -1; + r->ref_len[0] = 0; + *ref = NULL; + return 0; + } + + *ref = r->ref[0]; + *ref_len = r->ref_len[0]; + return 1; +} + +static int mplp_func(void *data, bam1_t *b) +{ + char *ref; + mplp_aux_t *ma = (mplp_aux_t*)data; + int ret, ref_len; + while (1) + { + int has_ref; + ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); + if (ret < 0) break; + // The 'B' cigar operation is not part of the specification, considering as obsolete. + // bam_remove_B(b); + if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads + if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue; + if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue; + if (ma->conf->bed) + { + // test overlap + regitr_t *itr = ma->conf->bed_itr; + int beg = b->core.pos, end = bam_endpos(b)-1; + int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr); + if ( !ma->conf->bed_logic && !overlap ) + { + // exclude only reads which are fully contained in the region + while ( regitr_overlap(itr) ) + { + if ( beg < itr->beg ) { overlap = 1; break; } + if ( end > itr->end ) { overlap = 1; break; } + } + } + if ( !overlap ) continue; + } + if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue; + if (ma->conf->flag & MPLP_ILLUMINA13) { + int i; + uint8_t *qual = bam_get_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + qual[i] = qual[i] > 31? qual[i] - 31 : 0; + } + + if (ma->conf->fai && b->core.tid >= 0) { + has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); + if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence + fprintf(pysam_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", + __func__, b->core.pos, ref_len, b->core.tid); + continue; + } + } else { + has_ref = 0; + } + + if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + if (has_ref && ma->conf->capQ_thres > 10) { + int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); + if (q < 0) continue; // skip + else if (b->core.qual > q) b->core.qual = q; + } + if (b->core.qual < ma->conf->min_mq) continue; + else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue; + + return ret; + }; + return ret; +} + +// Called once per new bam added to the pileup. +// We cache sample information here so we don't have to keep recomputing this +// on each and every pileup column. +// +// Cd is an arbitrary block of data we can write into, which ends up in +// the pileup structures. We stash the sample ID there. +static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + mplp_aux_t *ma = (mplp_aux_t *)data; + cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); + return 0; +} + +static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp) +{ + int i, j; + memset(m->n_plp, 0, m->n * sizeof(int)); + for (i = 0; i < n; ++i) // iterate over all bams + { + for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position + { + const bam_pileup1_t *p = plp[i] + j; + int id = p->cd.i; + if (m->n_plp[id] == m->m_plp[id]) + { + m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; + m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); + } + m->plp[id][m->n_plp[id]++] = *p; + } + } +} + +static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec) +{ + if ( !conf->gvcf ) + { + if ( rec ) bcf_write1(fp, hdr, rec); + return; + } + + if ( !rec ) + { + gvcf_write(conf->gvcf, fp, hdr, NULL, 0); + return; + } + + int is_ref = 0; + if ( rec->n_allele==1 ) is_ref = 1; + else if ( rec->n_allele==2 ) + { + // second allele is mpileup's X, not a variant + if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1; + } + rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref); + if ( rec ) bcf_write1(fp,hdr,rec); +} + +static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) +{ + bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list + + int ret, i, tid, pos, ref_len; + char *ref; + + while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) + { + if ( end && (posend) ) continue; + if ( conf->bed && tid >= 0 ) + { + int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL); + if ( !conf->bed_logic ) overlap = overlap ? 0 : 1; + if ( !overlap ) continue; + } + mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + + int total_depth, _ref0, ref16; + for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i]; + group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp); + _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; + ref16 = seq_nt16_table[_ref0]; + bcf_callaux_clean(conf->bca, &conf->bc); + for (i = 0; i < conf->gplp->n; ++i) + bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i); + conf->bc.tid = tid; conf->bc.pos = pos; + bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc); + bcf_clear1(conf->bcf_rec); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0); + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + + // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? + // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth + && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0) + { + bcf_callaux_clean(conf->bca, &conf->bc); + for (i = 0; i < conf->gplp->n; ++i) + bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); + if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + { + bcf_clear1(conf->bcf_rec); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + } + } + } + return 0; +} + +static int mpileup(mplp_conf_t *conf) +{ + if (conf->nfiles == 0) { + fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__); + exit(EXIT_FAILURE); + } + + mplp_ref_t mp_ref = MPLP_REF_INIT; + conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t)); + conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*)); + conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*)); + conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int)); + + // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index + // must be kept in the memory for the whole time which can be a problem with many bams. + // Therefore if none or only one region is requested, we initialize the bam iterator as + // before and free the index. Only when multiple regions are queried, we keep the index. + int nregs = 0; + if ( conf->reg_fname ) + { + if ( conf->reg_is_file ) + { + conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL); + if ( !conf->reg ) { + fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname); + exit(EXIT_FAILURE); + } + } + else + { + conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); + if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) { + fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname); + exit(EXIT_FAILURE); + } + } + nregs = regidx_nregs(conf->reg); + conf->reg_itr = regitr_init(conf->reg); + regitr_loop(conf->reg_itr); // region iterator now positioned at the first region + } + + // read the header of each file in the list and initialize data + // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least! + bam_hdr_t *hdr = NULL; // header of first file in input list + int i; + for (i = 0; i < conf->nfiles; ++i) { + bam_hdr_t *h_tmp; + conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t)); + conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb"); + if ( !conf->mplp_data[i]->fp ) + { + fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); + exit(EXIT_FAILURE); + } + if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + exit(EXIT_FAILURE); + } + if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) { + fprintf(pysam_stderr, "[%s] failed to process %s: %s\n", + __func__, conf->fai_fname, strerror(errno)); + exit(EXIT_FAILURE); + } + conf->mplp_data[i]->conf = conf; + conf->mplp_data[i]->ref = &mp_ref; + h_tmp = sam_hdr_read(conf->mplp_data[i]->fp); + if ( !h_tmp ) { + fprintf(pysam_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); + exit(EXIT_FAILURE); + } + conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet + conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]); + if ( conf->mplp_data[i]->bam_id<0 ) + { + // no usable readgroups in this bam, it can be skipped + sam_close(conf->mplp_data[i]->fp); + free(conf->mplp_data[i]); + bam_hdr_destroy(h_tmp); + free(conf->files[i]); + if ( i+1nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1)); + conf->nfiles--; + i--; + continue; + } + if (conf->reg) { + hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); + if (idx == NULL) { + fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); + exit(EXIT_FAILURE); + } + conf->buf.l = 0; + ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); + conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s); + if ( !conf->mplp_data[i]->iter ) + { + conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); + if ( conf->mplp_data[i]->iter ) { + fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); + exit(EXIT_FAILURE); + } + fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); + exit(EXIT_FAILURE); + } + if ( nregs==1 ) // no need to keep the index in memory + hts_idx_destroy(idx); + else + conf->mplp_data[i]->idx = idx; + } + + if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */ + else { + // FIXME: check consistency between h and h_tmp + bam_hdr_destroy(h_tmp); + + // we store only the first file's header; it's (alleged to be) + // compatible with the i-th file's target_name lookup needs + conf->mplp_data[i]->h = hdr; + } + } + // allocate data storage proportionate to number of samples being studied sm->n + bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n); + conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int)); + conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); + conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); + + fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); + // write the VCF header + conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); + if (conf->bcf_fp == NULL) { + fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); + exit(EXIT_FAILURE); + } + if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads); + + // BCF header creation + conf->bcf_hdr = bcf_hdr_init("w"); + conf->buf.l = 0; + + if (conf->record_cmd_line) + { + ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version()); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + + conf->buf.l = 0; + ksprintf(&conf->buf, "##bcftoolsCommand=mpileup"); + for (i=1; iargc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]); + kputc('\n', &conf->buf); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + } + + if (conf->fai_fname) + { + conf->buf.l = 0; + ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + } + + // Translate BAM @SQ tags to BCF ##contig tags + // todo: use/write new BAM header manipulation routines, fill also UR, M5 + for (i=0; in_targets; i++) + { + conf->buf.l = 0; + ksprintf(&conf->buf, "##contig=", hdr->target_name[i], hdr->target_len[i]); + bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + } + conf->buf.l = 0; + + bcf_hdr_append(conf->bcf_hdr,"##ALT="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); +#if CDF_MWU_TESTS + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); +#endif + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_DP ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_DV ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_DPR ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_INFO_DPR ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_FMT_DP4 ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_SP ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_AD ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_ADF ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_ADR ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_INFO_AD ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_ADF ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_ADR ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->gvcf ) + gvcf_update_header(conf->gvcf, conf->bcf_hdr); + + int nsmpl; + const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); + for (i=0; ibcf_hdr, smpl[i]); + bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); + + conf->bca = bcf_call_init(-1., conf->min_baseQ); + conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); + conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; + conf->bca->min_frac = conf->min_frac; + conf->bca->min_support = conf->min_support; + conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; + + conf->bc.bcf_hdr = conf->bcf_hdr; + conf->bc.n = nsmpl; + conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); + if (conf->fmt_flag) + { + assert( sizeof(float)==sizeof(int32_t) ); + conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4); + conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32 + if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) + { + // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample + conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); + conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); + for (i=0; ibcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES; + conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; + } + } + } + + // init mpileup + conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); + if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); + if ( (double)conf->max_depth * conf->nfiles > 1<<20) + fprintf(pysam_stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); + if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) + fprintf(pysam_stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl); + bam_mplp_set_maxcnt(conf->iter, conf->max_depth); + conf->max_indel_depth = conf->max_indel_depth * nsmpl; + conf->bcf_rec = bcf_init1(); + bam_mplp_constructor(conf->iter, pileup_constructor); + + // Run mpileup for multiple regions + if ( nregs ) + { + int ireg = 0; + do + { + // first region is already positioned + if ( ireg++ > 0 ) + { + conf->buf.l = 0; + ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); + + for (i=0; infiles; i++) + { + hts_itr_destroy(conf->mplp_data[i]->iter); + conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); + if ( !conf->mplp_data[i]->iter ) + { + conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); + if ( conf->mplp_data[i]->iter ) { + fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); + exit(EXIT_FAILURE); + } + fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); + exit(EXIT_FAILURE); + } + bam_mplp_reset(conf->iter); + } + } + mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end); + } + while ( regitr_loop(conf->reg_itr) ); + } + else + mpileup_reg(conf,0,0); + + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); + + // clean up + free(conf->bc.tmp.s); + bcf_destroy1(conf->bcf_rec); + if (conf->bcf_fp) + { + hts_close(conf->bcf_fp); + bcf_hdr_destroy(conf->bcf_hdr); + bcf_call_destroy(conf->bca); + free(conf->bc.PL); + free(conf->bc.DP4); + free(conf->bc.ADR); + free(conf->bc.ADF); + free(conf->bc.fmt_arr); + free(conf->bcr); + } + if ( conf->gvcf ) gvcf_destroy(conf->gvcf); + free(conf->buf.s); + for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]); + free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp); + bam_mplp_destroy(conf->iter); + bam_hdr_destroy(hdr); + for (i = 0; i < conf->nfiles; ++i) { + if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx); + sam_close(conf->mplp_data[i]->fp); + if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter); + free(conf->mplp_data[i]); + } + if ( conf->reg_itr ) regitr_destroy(conf->reg_itr); + free(conf->mplp_data); free(conf->plp); free(conf->n_plp); + free(mp_ref.ref[0]); + free(mp_ref.ref[1]); + return 0; +} + +static int is_url(const char *s) +{ + static const char uri_scheme_chars[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"; + return s[strspn(s, uri_scheme_chars)] == ':'; +} + +#define MAX_PATH_LEN 1024 +int read_file_list(const char *file_list,int *n,char **argv[]) +{ + char buf[MAX_PATH_LEN]; + int len, nfiles = 0; + char **files = NULL; + struct stat sb; + + *n = 0; + *argv = NULL; + + FILE *fh = fopen(file_list,"r"); + if ( !fh ) + { + fprintf(pysam_stderr,"%s: %s\n", file_list,strerror(errno)); + return 1; + } + + files = (char**) calloc(nfiles,sizeof(char*)); + nfiles = 0; + while ( fgets(buf,MAX_PATH_LEN,fh) ) + { + // allow empty lines and trailing spaces + len = strlen(buf); + while ( len>0 && isspace(buf[len-1]) ) len--; + if ( !len ) continue; + + // check sanity of the file list + buf[len] = 0; + if (! (is_url(buf) || stat(buf, &sb) == 0)) + { + // no such file, check if it is safe to print its name + int i, safe_to_print = 1; + for (i=0; irflag_require); + char *tmp_filter = bam_flag2str(mplp->rflag_filter); + + // Display usage information, formatted for the standard 80 columns. + // (The unusual string formatting here aids the readability of this + // source code in 80 columns, to the extent that's possible.) + + fprintf(fp, +"\n" +"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" +"\n" +"Input options:\n" +" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" +" -A, --count-orphans do not discard anomalous read pairs\n" +" -b, --bam-list FILE list of input BAM filenames, one per line\n" +" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" +" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" +" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + fprintf(fp, +" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" +" -f, --fasta-ref FILE faidx indexed reference sequence file\n" +" --no-reference do not require fasta reference file\n" +" -G, --read-groups FILE select or exclude read groups listed in the file\n" +" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + fprintf(fp, +" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + fprintf(fp, +" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" +" -R, --regions-file FILE restrict to regions listed in a file\n" +" --ignore-RG ignore RG tags (one BAM = one sample)\n" +" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); + fprintf(fp, +" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" +" [%s]\n", tmp_filter); + fprintf(fp, +" -s, --samples LIST comma separated list of samples to include\n" +" -S, --samples-file FILE file of samples to include\n" +" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" +" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +" -x, --ignore-overlaps disable read-pair overlap detection\n" +"\n" +"Output options:\n" +" -a, --annotate LIST optional tags to output; '?' to list []\n" +" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n" +" to minimum per-sample DP\n" +" --no-version do not append version and command line to the header\n" +" -o, --output FILE write output to FILE [standard output]\n" +" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" +" 'z' compressed VCF; 'v' uncompressed VCF [v]\n" +" --threads INT number of extra output compression threads [0]\n" +"\n" +"SNP/INDEL genotype likelihoods options:\n" +" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); + fprintf(fp, +" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); + fprintf(fp, +" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + fprintf(fp, +" -I, --skip-indels do not perform indel calling\n" +" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + fprintf(fp, +" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + fprintf(fp, +" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + fprintf(fp, +" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" +" -P, --platforms STR comma separated list of platforms for indels [all]\n" +"\n" +"Notes: Assuming diploid individuals.\n" +"\n"); + + free(tmp_require); + free(tmp_filter); +} + +int bam_mpileup(int argc, char *argv[]) +{ + int c; + const char *file_list = NULL; + char **fn = NULL; + int nfiles = 0, use_orphan = 0, noref = 0; + mplp_conf_t mplp; + memset(&mplp, 0, sizeof(mplp_conf_t)); + mplp.min_baseQ = 13; + mplp.capQ_thres = 0; + mplp.max_depth = 250; mplp.max_indel_depth = 250; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; + mplp.min_frac = 0.002; mplp.min_support = 1; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; + mplp.argc = argc; mplp.argv = argv; + mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + mplp.output_fname = NULL; + mplp.output_type = FT_VCF; + mplp.record_cmd_line = 1; + mplp.n_threads = 0; + mplp.bsmpl = bam_smpl_init(); + + static const struct option lopts[] = + { + {"rf", required_argument, NULL, 1}, // require flag + {"ff", required_argument, NULL, 2}, // filter flag + {"incl-flags", required_argument, NULL, 1}, + {"excl-flags", required_argument, NULL, 2}, + {"output", required_argument, NULL, 3}, + {"open-prob", required_argument, NULL, 4}, + {"ignore-RG", no_argument, NULL, 5}, + {"ignore-rg", no_argument, NULL, 5}, + {"gvcf", required_argument, NULL, 'g'}, + {"non-reference", no_argument, NULL, 7}, + {"no-version", no_argument, NULL, 8}, + {"threads",required_argument,NULL,9}, + {"illumina1.3+", no_argument, NULL, '6'}, + {"count-orphans", no_argument, NULL, 'A'}, + {"bam-list", required_argument, NULL, 'b'}, + {"no-BAQ", no_argument, NULL, 'B'}, + {"no-baq", no_argument, NULL, 'B'}, + {"adjust-MQ", required_argument, NULL, 'C'}, + {"adjust-mq", required_argument, NULL, 'C'}, + {"max-depth", required_argument, NULL, 'd'}, + {"redo-BAQ", no_argument, NULL, 'E'}, + {"redo-baq", no_argument, NULL, 'E'}, + {"fasta-ref", required_argument, NULL, 'f'}, + {"read-groups", required_argument, NULL, 'G'}, + {"region", required_argument, NULL, 'r'}, + {"regions", required_argument, NULL, 'r'}, + {"regions-file", required_argument, NULL, 'R'}, + {"targets", required_argument, NULL, 't'}, + {"targets-file", required_argument, NULL, 'T'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-mq", required_argument, NULL, 'q'}, + {"min-BQ", required_argument, NULL, 'Q'}, + {"min-bq", required_argument, NULL, 'Q'}, + {"ignore-overlaps", no_argument, NULL, 'x'}, + {"output-type", required_argument, NULL, 'O'}, + {"samples", required_argument, NULL, 's'}, + {"samples-file", required_argument, NULL, 'S'}, + {"annotate", required_argument, NULL, 'a'}, + {"ext-prob", required_argument, NULL, 'e'}, + {"gap-frac", required_argument, NULL, 'F'}, + {"tandem-qual", required_argument, NULL, 'h'}, + {"skip-indels", no_argument, NULL, 'I'}, + {"max-idepth", required_argument, NULL, 'L'}, + {"min-ireads ", required_argument, NULL, 'm'}, + {"per-sample-mF", no_argument, NULL, 'p'}, + {"per-sample-mf", no_argument, NULL, 'p'}, + {"platforms", required_argument, NULL, 'P'}, + {NULL, 0, NULL, 0} + }; + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) { + switch (c) { + case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; + case 1 : + mplp.rflag_require = bam_str2flag(optarg); + if ( mplp.rflag_require<0 ) { fprintf(pysam_stderr,"Could not parse --rf %s\n", optarg); return 1; } + break; + case 2 : + mplp.rflag_filter = bam_str2flag(optarg); + if ( mplp.rflag_filter<0 ) { fprintf(pysam_stderr,"Could not parse --ff %s\n", optarg); return 1; } + break; + case 3 : mplp.output_fname = optarg; break; + case 4 : mplp.openQ = atoi(optarg); break; + case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break; + case 'g': + mplp.gvcf = gvcf_init(optarg); + if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg); + break; + case 'f': + mplp.fai = fai_load(optarg); + if (mplp.fai == NULL) return 1; + mplp.fai_fname = optarg; + break; + case 7 : noref = 1; break; + case 8 : mplp.record_cmd_line = 0; break; + case 9 : mplp.n_threads = strtol(optarg, 0, 0); break; + case 'd': mplp.max_depth = atoi(optarg); break; + case 'r': mplp.reg_fname = strdup(optarg); break; + case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break; + case 't': + // In the original version the whole BAM was streamed which is inefficient + // with few BED intervals and big BAMs. Todo: devise a heuristic to determine + // best strategy, that is streaming or jumping. + if ( optarg[0]=='^' ) optarg++; + else mplp.bed_logic = 1; + mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); + mplp.bed_itr = regitr_init(mplp.bed); + if ( regidx_insert_list(mplp.bed,optarg,',') !=0 ) + { + fprintf(pysam_stderr,"Could not parse the targets: %s\n", optarg); + exit(EXIT_FAILURE); + } + break; + case 'T': + if ( optarg[0]=='^' ) optarg++; + else mplp.bed_logic = 1; + mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL); + if (!mplp.bed) { fprintf(pysam_stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; } + break; + case 'P': mplp.pl_list = strdup(optarg); break; + case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; + case 'B': mplp.flag &= ~MPLP_REALN; break; + case 'I': mplp.flag |= MPLP_NO_INDEL; break; + case 'E': mplp.flag |= MPLP_REDO_BAQ; break; + case '6': mplp.flag |= MPLP_ILLUMINA13; break; + case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break; + case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break; + case 'O': + switch (optarg[0]) { + case 'b': mplp.output_type = FT_BCF_GZ; break; + case 'u': mplp.output_type = FT_BCF; break; + case 'z': mplp.output_type = FT_VCF_GZ; break; + case 'v': mplp.output_type = FT_VCF; break; + default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n", optarg); + } + break; + case 'C': mplp.capQ_thres = atoi(optarg); break; + case 'q': mplp.min_mq = atoi(optarg); break; + case 'Q': mplp.min_baseQ = atoi(optarg); break; + case 'b': file_list = optarg; break; + case 'o': { + char *end; + long value = strtol(optarg, &end, 10); + // Distinguish between -o INT and -o FILE (a bit of a hack!) + if (*end == '\0') mplp.openQ = value; + else mplp.output_fname = optarg; + } + break; + case 'e': mplp.extQ = atoi(optarg); break; + case 'h': mplp.tandemQ = atoi(optarg); break; + case 'A': use_orphan = 1; break; + case 'F': mplp.min_frac = atof(optarg); break; + case 'm': mplp.min_support = atoi(optarg); break; + case 'L': mplp.max_indel_depth = atoi(optarg); break; + case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break; + case 'a': + if (optarg[0]=='?') { + list_annotations(pysam_stderr); + return 1; + } + mplp.fmt_flag |= parse_format_flag(optarg); + break; + default: + fprintf(pysam_stderr,"Invalid option: '%c'\n", c); + return 1; + } + } + + if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) + { + fprintf(pysam_stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n"); + mplp.fmt_flag |= B2B_FMT_DP; + } + if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) + { + if ( mplp.flag&MPLP_VCF ) + { + if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; + else mplp.output_type = FT_VCF_GZ; + } + else if ( mplp.flag&MPLP_BCF ) + { + if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; + else mplp.output_type = FT_BCF_GZ; + } + } + if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) + { + fprintf(pysam_stderr,"Error: The -B option cannot be combined with -E\n"); + return 1; + } + if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; + if (argc == 1) + { + print_usage(pysam_stderr, &mplp); + return 1; + } + if (!mplp.fai && !noref) { + fprintf(pysam_stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n"); + return 1; + } + int ret,i; + if (file_list) + { + if ( read_file_list(file_list,&nfiles,&fn) ) return 1; + mplp.files = fn; + mplp.nfiles = nfiles; + } + else + { + mplp.nfiles = argc - optind; + mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*)); + for (i=0; i + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +// Code to build this table is below +#ifdef BUILD_MW +#include + +double mann_whitney_1947(int n, int m, int U) +{ + if (U<0) return 0; + if (n==0||m==0) return U==0 ? 1 : 0; + return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U); +} + +int main(void) { + int i, j, k; + printf("static double mw[6][6][50] = // [2-7][2-7][0-49]\n{\n"); + for (i = 2; i < 8; i++) { + printf(" {\n"); + for (j = 2; j < 8; j++) { + printf(" {\n"); + for (k = 0; k < 50; k++) { + printf(" %.17f,\n", mann_whitney_1947(i,j,k)); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + return 0; +} +#endif + +static double mw[6][6][50] = // [2-7][2-7][0-49] +{ + { + { + 0.16666666666666666, + 0.16666666666666666, + 0.33333333333333331, + 0.16666666666666666, + 0.16666666666666666, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.09999999999999999, + 0.09999999999999999, + 0.19999999999999998, + 0.20000000000000001, + 0.20000000000000001, + 0.10000000000000001, + 0.10000000000000001, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.06666666666666665, + 0.06666666666666665, + 0.13333333333333330, + 0.13333333333333333, + 0.20000000000000001, + 0.13333333333333333, + 0.13333333333333333, + 0.06666666666666667, + 0.06666666666666667, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.04761904761904761, + 0.04761904761904761, + 0.09523809523809522, + 0.09523809523809523, + 0.14285714285714288, + 0.14285714285714285, + 0.14285714285714285, + 0.09523809523809523, + 0.09523809523809523, + 0.04761904761904762, + 0.04761904761904762, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.03571428571428571, + 0.03571428571428571, + 0.07142857142857141, + 0.07142857142857142, + 0.10714285714285715, + 0.10714285714285714, + 0.14285714285714285, + 0.10714285714285715, + 0.10714285714285715, + 0.07142857142857144, + 0.07142857142857142, + 0.03571428571428571, + 0.03571428571428571, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.02777777777777777, + 0.02777777777777777, + 0.05555555555555555, + 0.05555555555555555, + 0.08333333333333334, + 0.08333333333333333, + 0.11111111111111110, + 0.11111111111111113, + 0.11111111111111113, + 0.08333333333333334, + 0.08333333333333334, + 0.05555555555555556, + 0.05555555555555555, + 0.02777777777777778, + 0.02777777777777778, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + }, + { + { + 0.10000000000000001, + 0.10000000000000001, + 0.20000000000000001, + 0.20000000000000001, + 0.19999999999999998, + 0.09999999999999999, + 0.09999999999999999, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.05000000000000000, + 0.05000000000000000, + 0.10000000000000001, + 0.14999999999999999, + 0.14999999999999999, + 0.14999999999999999, + 0.14999999999999999, + 0.10000000000000001, + 0.05000000000000000, + 0.05000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.02857142857142857, + 0.02857142857142857, + 0.05714285714285714, + 0.08571428571428570, + 0.11428571428571427, + 0.11428571428571427, + 0.14285714285714282, + 0.11428571428571428, + 0.11428571428571428, + 0.08571428571428572, + 0.05714285714285714, + 0.02857142857142857, + 0.02857142857142857, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.01785714285714286, + 0.01785714285714286, + 0.03571428571428571, + 0.05357142857142856, + 0.07142857142857142, + 0.08928571428571427, + 0.10714285714285711, + 0.10714285714285712, + 0.10714285714285714, + 0.10714285714285715, + 0.08928571428571427, + 0.07142857142857142, + 0.05357142857142857, + 0.03571428571428571, + 0.01785714285714286, + 0.01785714285714286, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.01190476190476190, + 0.01190476190476190, + 0.02380952380952381, + 0.03571428571428571, + 0.04761904761904762, + 0.05952380952380951, + 0.08333333333333330, + 0.08333333333333331, + 0.09523809523809523, + 0.09523809523809523, + 0.09523809523809523, + 0.08333333333333333, + 0.08333333333333333, + 0.05952380952380952, + 0.04761904761904762, + 0.03571428571428571, + 0.02380952380952381, + 0.01190476190476190, + 0.01190476190476190, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00833333333333333, + 0.00833333333333333, + 0.01666666666666666, + 0.02499999999999999, + 0.03333333333333333, + 0.04166666666666666, + 0.05833333333333331, + 0.06666666666666665, + 0.07499999999999998, + 0.08333333333333331, + 0.08333333333333331, + 0.08333333333333333, + 0.08333333333333333, + 0.07500000000000000, + 0.06666666666666667, + 0.05833333333333333, + 0.04166666666666666, + 0.03333333333333333, + 0.02500000000000000, + 0.01666666666666667, + 0.00833333333333333, + 0.00833333333333333, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + }, + { + { + 0.06666666666666667, + 0.06666666666666667, + 0.13333333333333333, + 0.13333333333333333, + 0.20000000000000001, + 0.13333333333333333, + 0.13333333333333330, + 0.06666666666666665, + 0.06666666666666665, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.02857142857142857, + 0.02857142857142857, + 0.05714285714285714, + 0.08571428571428572, + 0.11428571428571428, + 0.11428571428571428, + 0.14285714285714282, + 0.11428571428571427, + 0.11428571428571427, + 0.08571428571428570, + 0.05714285714285714, + 0.02857142857142857, + 0.02857142857142857, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.01428571428571429, + 0.01428571428571429, + 0.02857142857142857, + 0.04285714285714286, + 0.07142857142857142, + 0.07142857142857142, + 0.09999999999999998, + 0.09999999999999998, + 0.11428571428571427, + 0.09999999999999998, + 0.09999999999999998, + 0.07142857142857142, + 0.07142857142857142, + 0.04285714285714286, + 0.02857142857142857, + 0.01428571428571429, + 0.01428571428571429, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00793650793650794, + 0.00793650793650794, + 0.01587301587301587, + 0.02380952380952381, + 0.03968253968253968, + 0.04761904761904762, + 0.06349206349206349, + 0.07142857142857142, + 0.08730158730158730, + 0.08730158730158730, + 0.09523809523809522, + 0.08730158730158728, + 0.08730158730158730, + 0.07142857142857142, + 0.06349206349206349, + 0.04761904761904761, + 0.03968253968253968, + 0.02380952380952381, + 0.01587301587301587, + 0.00793650793650794, + 0.00793650793650794, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00476190476190476, + 0.00476190476190476, + 0.00952380952380952, + 0.01428571428571429, + 0.02380952380952381, + 0.02857142857142857, + 0.04285714285714286, + 0.04761904761904762, + 0.06190476190476190, + 0.06666666666666665, + 0.07619047619047617, + 0.07619047619047617, + 0.08571428571428569, + 0.07619047619047617, + 0.07619047619047620, + 0.06666666666666667, + 0.06190476190476191, + 0.04761904761904762, + 0.04285714285714286, + 0.02857142857142857, + 0.02380952380952381, + 0.01428571428571429, + 0.00952380952380952, + 0.00476190476190476, + 0.00476190476190476, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00303030303030303, + 0.00303030303030303, + 0.00606060606060606, + 0.00909090909090909, + 0.01515151515151515, + 0.01818181818181818, + 0.02727272727272727, + 0.03333333333333333, + 0.04242424242424242, + 0.04848484848484847, + 0.05757575757575756, + 0.06060606060606059, + 0.06969696969696967, + 0.06969696969696967, + 0.07272727272727272, + 0.06969696969696969, + 0.06969696969696970, + 0.06060606060606059, + 0.05757575757575757, + 0.04848484848484848, + 0.04242424242424242, + 0.03333333333333333, + 0.02727272727272727, + 0.01818181818181818, + 0.01515151515151515, + 0.00909090909090909, + 0.00606060606060606, + 0.00303030303030303, + 0.00303030303030303, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + }, + { + { + 0.04761904761904762, + 0.04761904761904762, + 0.09523809523809523, + 0.09523809523809523, + 0.14285714285714285, + 0.14285714285714285, + 0.14285714285714288, + 0.09523809523809523, + 0.09523809523809522, + 0.04761904761904761, + 0.04761904761904761, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.01785714285714286, + 0.01785714285714286, + 0.03571428571428571, + 0.05357142857142857, + 0.07142857142857142, + 0.08928571428571427, + 0.10714285714285715, + 0.10714285714285714, + 0.10714285714285712, + 0.10714285714285711, + 0.08928571428571427, + 0.07142857142857142, + 0.05357142857142856, + 0.03571428571428571, + 0.01785714285714286, + 0.01785714285714286, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00793650793650794, + 0.00793650793650794, + 0.01587301587301587, + 0.02380952380952381, + 0.03968253968253968, + 0.04761904761904761, + 0.06349206349206349, + 0.07142857142857142, + 0.08730158730158730, + 0.08730158730158728, + 0.09523809523809522, + 0.08730158730158730, + 0.08730158730158730, + 0.07142857142857142, + 0.06349206349206349, + 0.04761904761904762, + 0.03968253968253968, + 0.02380952380952381, + 0.01587301587301587, + 0.00793650793650794, + 0.00793650793650794, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00396825396825397, + 0.00396825396825397, + 0.00793650793650794, + 0.01190476190476190, + 0.01984126984126984, + 0.02777777777777777, + 0.03571428571428571, + 0.04365079365079365, + 0.05555555555555555, + 0.06349206349206349, + 0.07142857142857142, + 0.07539682539682539, + 0.07936507936507936, + 0.07936507936507936, + 0.07539682539682539, + 0.07142857142857142, + 0.06349206349206349, + 0.05555555555555555, + 0.04365079365079365, + 0.03571428571428571, + 0.02777777777777777, + 0.01984126984126984, + 0.01190476190476190, + 0.00793650793650794, + 0.00396825396825397, + 0.00396825396825397, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00216450216450216, + 0.00216450216450216, + 0.00432900432900433, + 0.00649350649350649, + 0.01082251082251082, + 0.01515151515151515, + 0.02164502164502164, + 0.02597402597402597, + 0.03463203463203463, + 0.04112554112554112, + 0.04978354978354978, + 0.05411255411255411, + 0.06277056277056275, + 0.06493506493506493, + 0.06926406926406925, + 0.06926406926406925, + 0.06926406926406925, + 0.06493506493506492, + 0.06277056277056275, + 0.05411255411255410, + 0.04978354978354978, + 0.04112554112554112, + 0.03463203463203463, + 0.02597402597402597, + 0.02164502164502164, + 0.01515151515151515, + 0.01082251082251082, + 0.00649350649350649, + 0.00432900432900433, + 0.00216450216450216, + 0.00216450216450216, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00126262626262626, + 0.00126262626262626, + 0.00252525252525253, + 0.00378787878787879, + 0.00631313131313131, + 0.00883838383838384, + 0.01262626262626262, + 0.01641414141414141, + 0.02146464646464646, + 0.02651515151515151, + 0.03282828282828283, + 0.03787878787878787, + 0.04419191919191919, + 0.04924242424242424, + 0.05429292929292929, + 0.05808080808080808, + 0.06060606060606059, + 0.06186868686868686, + 0.06186868686868686, + 0.06060606060606059, + 0.05808080808080807, + 0.05429292929292930, + 0.04924242424242424, + 0.04419191919191920, + 0.03787878787878787, + 0.03282828282828282, + 0.02651515151515152, + 0.02146464646464646, + 0.01641414141414142, + 0.01262626262626263, + 0.00883838383838384, + 0.00631313131313131, + 0.00378787878787879, + 0.00252525252525253, + 0.00126262626262626, + 0.00126262626262626, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + }, + { + { + 0.03571428571428571, + 0.03571428571428571, + 0.07142857142857142, + 0.07142857142857144, + 0.10714285714285715, + 0.10714285714285715, + 0.14285714285714285, + 0.10714285714285714, + 0.10714285714285715, + 0.07142857142857142, + 0.07142857142857141, + 0.03571428571428571, + 0.03571428571428571, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.01190476190476190, + 0.01190476190476190, + 0.02380952380952381, + 0.03571428571428571, + 0.04761904761904762, + 0.05952380952380952, + 0.08333333333333333, + 0.08333333333333333, + 0.09523809523809523, + 0.09523809523809523, + 0.09523809523809523, + 0.08333333333333331, + 0.08333333333333330, + 0.05952380952380951, + 0.04761904761904762, + 0.03571428571428571, + 0.02380952380952381, + 0.01190476190476190, + 0.01190476190476190, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00476190476190476, + 0.00476190476190476, + 0.00952380952380952, + 0.01428571428571429, + 0.02380952380952381, + 0.02857142857142857, + 0.04285714285714286, + 0.04761904761904762, + 0.06190476190476191, + 0.06666666666666667, + 0.07619047619047620, + 0.07619047619047617, + 0.08571428571428569, + 0.07619047619047617, + 0.07619047619047617, + 0.06666666666666665, + 0.06190476190476190, + 0.04761904761904762, + 0.04285714285714286, + 0.02857142857142857, + 0.02380952380952381, + 0.01428571428571429, + 0.00952380952380952, + 0.00476190476190476, + 0.00476190476190476, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00216450216450216, + 0.00216450216450216, + 0.00432900432900433, + 0.00649350649350649, + 0.01082251082251082, + 0.01515151515151515, + 0.02164502164502164, + 0.02597402597402597, + 0.03463203463203463, + 0.04112554112554112, + 0.04978354978354978, + 0.05411255411255410, + 0.06277056277056275, + 0.06493506493506492, + 0.06926406926406925, + 0.06926406926406925, + 0.06926406926406925, + 0.06493506493506493, + 0.06277056277056275, + 0.05411255411255411, + 0.04978354978354978, + 0.04112554112554112, + 0.03463203463203463, + 0.02597402597402597, + 0.02164502164502164, + 0.01515151515151515, + 0.01082251082251082, + 0.00649350649350649, + 0.00432900432900433, + 0.00216450216450216, + 0.00216450216450216, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00108225108225108, + 0.00108225108225108, + 0.00216450216450216, + 0.00324675324675325, + 0.00541125541125541, + 0.00757575757575758, + 0.01190476190476190, + 0.01406926406926407, + 0.01948051948051948, + 0.02380952380952381, + 0.03030303030303030, + 0.03463203463203463, + 0.04220779220779219, + 0.04545454545454544, + 0.05194805194805194, + 0.05519480519480519, + 0.05952380952380951, + 0.05952380952380952, + 0.06277056277056275, + 0.05952380952380952, + 0.05952380952380951, + 0.05519480519480519, + 0.05194805194805194, + 0.04545454545454544, + 0.04220779220779219, + 0.03463203463203463, + 0.03030303030303030, + 0.02380952380952381, + 0.01948051948051948, + 0.01406926406926407, + 0.01190476190476190, + 0.00757575757575758, + 0.00541125541125541, + 0.00324675324675325, + 0.00216450216450216, + 0.00108225108225108, + 0.00108225108225108, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00058275058275058, + 0.00058275058275058, + 0.00116550116550117, + 0.00174825174825175, + 0.00291375291375291, + 0.00407925407925408, + 0.00641025641025641, + 0.00815850815850816, + 0.01107226107226107, + 0.01398601398601398, + 0.01806526806526806, + 0.02156177156177156, + 0.02680652680652679, + 0.03030303030303030, + 0.03554778554778554, + 0.03962703962703962, + 0.04428904428904428, + 0.04720279720279720, + 0.05128205128205127, + 0.05244755244755244, + 0.05477855477855477, + 0.05477855477855477, + 0.05477855477855477, + 0.05244755244755243, + 0.05128205128205127, + 0.04720279720279720, + 0.04428904428904428, + 0.03962703962703962, + 0.03554778554778555, + 0.03030303030303030, + 0.02680652680652681, + 0.02156177156177156, + 0.01806526806526806, + 0.01398601398601399, + 0.01107226107226107, + 0.00815850815850816, + 0.00641025641025641, + 0.00407925407925408, + 0.00291375291375291, + 0.00174825174825175, + 0.00116550116550117, + 0.00058275058275058, + 0.00058275058275058, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + }, + { + { + 0.02777777777777778, + 0.02777777777777778, + 0.05555555555555555, + 0.05555555555555556, + 0.08333333333333334, + 0.08333333333333334, + 0.11111111111111113, + 0.11111111111111113, + 0.11111111111111110, + 0.08333333333333333, + 0.08333333333333334, + 0.05555555555555555, + 0.05555555555555555, + 0.02777777777777777, + 0.02777777777777777, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00833333333333333, + 0.00833333333333333, + 0.01666666666666667, + 0.02500000000000000, + 0.03333333333333333, + 0.04166666666666666, + 0.05833333333333333, + 0.06666666666666667, + 0.07500000000000000, + 0.08333333333333333, + 0.08333333333333333, + 0.08333333333333331, + 0.08333333333333331, + 0.07499999999999998, + 0.06666666666666665, + 0.05833333333333331, + 0.04166666666666666, + 0.03333333333333333, + 0.02499999999999999, + 0.01666666666666666, + 0.00833333333333333, + 0.00833333333333333, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00303030303030303, + 0.00303030303030303, + 0.00606060606060606, + 0.00909090909090909, + 0.01515151515151515, + 0.01818181818181818, + 0.02727272727272727, + 0.03333333333333333, + 0.04242424242424242, + 0.04848484848484848, + 0.05757575757575757, + 0.06060606060606059, + 0.06969696969696970, + 0.06969696969696969, + 0.07272727272727272, + 0.06969696969696967, + 0.06969696969696967, + 0.06060606060606059, + 0.05757575757575756, + 0.04848484848484847, + 0.04242424242424242, + 0.03333333333333333, + 0.02727272727272727, + 0.01818181818181818, + 0.01515151515151515, + 0.00909090909090909, + 0.00606060606060606, + 0.00303030303030303, + 0.00303030303030303, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00126262626262626, + 0.00126262626262626, + 0.00252525252525253, + 0.00378787878787879, + 0.00631313131313131, + 0.00883838383838384, + 0.01262626262626263, + 0.01641414141414142, + 0.02146464646464646, + 0.02651515151515152, + 0.03282828282828282, + 0.03787878787878787, + 0.04419191919191920, + 0.04924242424242424, + 0.05429292929292930, + 0.05808080808080807, + 0.06060606060606059, + 0.06186868686868686, + 0.06186868686868686, + 0.06060606060606059, + 0.05808080808080808, + 0.05429292929292929, + 0.04924242424242424, + 0.04419191919191919, + 0.03787878787878787, + 0.03282828282828283, + 0.02651515151515151, + 0.02146464646464646, + 0.01641414141414141, + 0.01262626262626262, + 0.00883838383838384, + 0.00631313131313131, + 0.00378787878787879, + 0.00252525252525253, + 0.00126262626262626, + 0.00126262626262626, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00058275058275058, + 0.00058275058275058, + 0.00116550116550117, + 0.00174825174825175, + 0.00291375291375291, + 0.00407925407925408, + 0.00641025641025641, + 0.00815850815850816, + 0.01107226107226107, + 0.01398601398601399, + 0.01806526806526806, + 0.02156177156177156, + 0.02680652680652681, + 0.03030303030303030, + 0.03554778554778555, + 0.03962703962703962, + 0.04428904428904428, + 0.04720279720279720, + 0.05128205128205127, + 0.05244755244755243, + 0.05477855477855477, + 0.05477855477855477, + 0.05477855477855477, + 0.05244755244755244, + 0.05128205128205127, + 0.04720279720279720, + 0.04428904428904428, + 0.03962703962703962, + 0.03554778554778554, + 0.03030303030303030, + 0.02680652680652679, + 0.02156177156177156, + 0.01806526806526806, + 0.01398601398601398, + 0.01107226107226107, + 0.00815850815850816, + 0.00641025641025641, + 0.00407925407925408, + 0.00291375291375291, + 0.00174825174825175, + 0.00116550116550117, + 0.00058275058275058, + 0.00058275058275058, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + 0.00000000000000000, + }, + { + 0.00029137529137529, + 0.00029137529137529, + 0.00058275058275058, + 0.00087412587412587, + 0.00145687645687646, + 0.00203962703962704, + 0.00320512820512821, + 0.00437062937062937, + 0.00582750582750583, + 0.00757575757575758, + 0.00990675990675991, + 0.01223776223776224, + 0.01544289044289044, + 0.01835664335664336, + 0.02185314685314686, + 0.02534965034965035, + 0.02913752913752913, + 0.03263403263403263, + 0.03642191142191141, + 0.03962703962703962, + 0.04254079254079253, + 0.04516317016317015, + 0.04720279720279719, + 0.04836829836829836, + 0.04924242424242423, + 0.04924242424242423, + 0.04836829836829836, + 0.04720279720279719, + 0.04516317016317015, + 0.04254079254079253, + 0.03962703962703962, + 0.03642191142191141, + 0.03263403263403263, + 0.02913752913752913, + 0.02534965034965035, + 0.02185314685314686, + 0.01835664335664336, + 0.01544289044289044, + 0.01223776223776224, + 0.00990675990675991, + 0.00757575757575758, + 0.00582750582750583, + 0.00437062937062937, + 0.00320512820512821, + 0.00203962703962704, + 0.00145687645687646, + 0.00087412587412587, + 0.00058275058275058, + 0.00029137529137529, + 0.00029137529137529, + }, + }, +}; diff --git a/bcftools/ploidy.c b/bcftools/ploidy.c index 719e175..550ba87 100644 --- a/bcftools/ploidy.c +++ b/bcftools/ploidy.c @@ -1,4 +1,4 @@ -/* +/* Copyright (C) 2014-2016 Genome Research Ltd. Author: Petr Danecek @@ -22,7 +22,6 @@ THE SOFTWARE. */ -#include #include #include #include @@ -35,6 +34,7 @@ struct _ploidy_t int dflt, min, max; // ploidy: default, min and max (only explicitly listed) int *sex2dflt; regidx_t *idx; + regitr_t *itr; void *sex2id; char **id2sex; kstring_t tmp_str; @@ -52,7 +52,7 @@ regidx_t *ploidy_regions(ploidy_t *ploidy) return ploidy->idx; } -int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) { int i, ret; ploidy_t *ploidy = (ploidy_t*) usr; @@ -68,7 +68,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v else { // Fill CHR,FROM,TO - ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL); + ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL); if ( ret!=0 ) return ret; } @@ -144,6 +144,7 @@ ploidy_t *ploidy_init(const char *fname, int dflt) ploidy_destroy(pld); return NULL; } + pld->itr = regitr_init(pld->idx); _set_defaults(pld,dflt); return pld; } @@ -156,6 +157,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) pld->min = pld->max = -1; pld->sex2id = khash_str2int_init(); pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld); + pld->itr = regitr_init(pld->idx); kstring_t tmp = {0,0,0}; const char *ss = str; @@ -170,7 +172,6 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) while ( *se && isspace(*se) ) se++; ss = se; } - regidx_insert(pld->idx,NULL); free(tmp.s); _set_defaults(pld,dflt); @@ -180,6 +181,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) void ploidy_destroy(ploidy_t *ploidy) { if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id); + if ( ploidy->itr ) regitr_destroy(ploidy->itr); if ( ploidy->idx ) regidx_destroy(ploidy->idx); free(ploidy->id2sex); free(ploidy->tmp_str.s); @@ -189,8 +191,7 @@ void ploidy_destroy(ploidy_t *ploidy) int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max) { - regitr_t itr; - int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr); + int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr); if ( !sex2ploidy && !min && !max ) return ret; @@ -207,17 +208,16 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min int _min = INT_MAX, _max = -1; if ( sex2ploidy ) for (i=0; insex; i++) sex2ploidy[i] = ploidy->dflt; - while ( REGITR_OVERLAP(itr,pos,pos) ) + while ( regitr_overlap(ploidy->itr) ) { - int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex; - int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy; + int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex; + int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy; if ( pld!=ploidy->dflt ) { if ( sex2ploidy ) sex2ploidy[ sex ] = pld; if ( _min > pld ) _min = pld; if ( _max < pld ) _max = pld; } - itr.i++; } if ( _max==-1 ) _max = _min = ploidy->dflt; if ( max ) *max = _max; diff --git a/bcftools/ploidy.c.pysam.c b/bcftools/ploidy.c.pysam.c index d0468b9..2eb9bd8 100644 --- a/bcftools/ploidy.c.pysam.c +++ b/bcftools/ploidy.c.pysam.c @@ -1,6 +1,6 @@ #include "pysam.h" -/* +/* Copyright (C) 2014-2016 Genome Research Ltd. Author: Petr Danecek @@ -24,7 +24,6 @@ THE SOFTWARE. */ -#include #include #include #include @@ -37,6 +36,7 @@ struct _ploidy_t int dflt, min, max; // ploidy: default, min and max (only explicitly listed) int *sex2dflt; regidx_t *idx; + regitr_t *itr; void *sex2id; char **id2sex; kstring_t tmp_str; @@ -54,7 +54,7 @@ regidx_t *ploidy_regions(ploidy_t *ploidy) return ploidy->idx; } -int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) { int i, ret; ploidy_t *ploidy = (ploidy_t*) usr; @@ -70,7 +70,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v else { // Fill CHR,FROM,TO - ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL); + ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL); if ( ret!=0 ) return ret; } @@ -146,6 +146,7 @@ ploidy_t *ploidy_init(const char *fname, int dflt) ploidy_destroy(pld); return NULL; } + pld->itr = regitr_init(pld->idx); _set_defaults(pld,dflt); return pld; } @@ -158,6 +159,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) pld->min = pld->max = -1; pld->sex2id = khash_str2int_init(); pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld); + pld->itr = regitr_init(pld->idx); kstring_t tmp = {0,0,0}; const char *ss = str; @@ -172,7 +174,6 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) while ( *se && isspace(*se) ) se++; ss = se; } - regidx_insert(pld->idx,NULL); free(tmp.s); _set_defaults(pld,dflt); @@ -182,6 +183,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) void ploidy_destroy(ploidy_t *ploidy) { if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id); + if ( ploidy->itr ) regitr_destroy(ploidy->itr); if ( ploidy->idx ) regidx_destroy(ploidy->idx); free(ploidy->id2sex); free(ploidy->tmp_str.s); @@ -191,8 +193,7 @@ void ploidy_destroy(ploidy_t *ploidy) int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max) { - regitr_t itr; - int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr); + int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr); if ( !sex2ploidy && !min && !max ) return ret; @@ -209,17 +210,16 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min int _min = INT_MAX, _max = -1; if ( sex2ploidy ) for (i=0; insex; i++) sex2ploidy[i] = ploidy->dflt; - while ( REGITR_OVERLAP(itr,pos,pos) ) + while ( regitr_overlap(ploidy->itr) ) { - int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex; - int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy; + int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex; + int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy; if ( pld!=ploidy->dflt ) { if ( sex2ploidy ) sex2ploidy[ sex ] = pld; if ( _min > pld ) _min = pld; if ( _max < pld ) _max = pld; } - itr.i++; } if ( _max==-1 ) _max = _min = ploidy->dflt; if ( max ) *max = _max; diff --git a/bcftools/ploidy.h b/bcftools/ploidy.h index 6deef73..1e7d2f7 100644 --- a/bcftools/ploidy.h +++ b/bcftools/ploidy.h @@ -55,7 +55,7 @@ #ifndef __PLOIDY_H__ #define __PLOIDY_H__ -#include +#include "regidx.h" typedef struct _ploidy_t ploidy_t; diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 8f4463f..954d43c 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -157,8 +157,9 @@ int test16(bcf1_t *b, anno16_t *a); static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) { int i, j; - long *p, tmp; - p = (long*) alloca(b->n_allele * sizeof(long)); + long p_a[16], *p=p_a, tmp; + if (b->n_allele > 16) + p = (long*) malloc(b->n_allele * sizeof(long)); memset(p, 0, sizeof(long) * b->n_allele); // Set P(D|g) for each sample and sum phread likelihoods across all samples to create lk @@ -177,12 +178,14 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; for (i = b->n_allele - 1; i >= 0; --i) if ((p[i]&0xf) == 0) break; + if (p != p_a) + free(p); return i; } -/* f0 is minor allele fraction */ -int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) +/* f0 is freq of the ref allele */ +int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var) { double sum, g[3]; double max, f3[3], *pdg = ma->pdg + k * 3; @@ -203,6 +206,7 @@ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) g[i] /= sum; if (g[i] > max) max = g[i], max_i = i; } + if ( !is_var ) { max_i = 2; max = g[2]; } // force 0/0 genotype if the site is non-variant max = 1. - max; if (max < 1e-308) max = 1e-308; q = (int)(-4.343 * log(max) + .499); diff --git a/bcftools/prob1.c.pysam.c b/bcftools/prob1.c.pysam.c index a59ec44..f4f4271 100644 --- a/bcftools/prob1.c.pysam.c +++ b/bcftools/prob1.c.pysam.c @@ -159,8 +159,9 @@ int test16(bcf1_t *b, anno16_t *a); static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) { int i, j; - long *p, tmp; - p = (long*) alloca(b->n_allele * sizeof(long)); + long p_a[16], *p=p_a, tmp; + if (b->n_allele > 16) + p = (long*) malloc(b->n_allele * sizeof(long)); memset(p, 0, sizeof(long) * b->n_allele); // Set P(D|g) for each sample and sum phread likelihoods across all samples to create lk @@ -179,12 +180,14 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; for (i = b->n_allele - 1; i >= 0; --i) if ((p[i]&0xf) == 0) break; + if (p != p_a) + free(p); return i; } -/* f0 is minor allele fraction */ -int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) +/* f0 is freq of the ref allele */ +int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var) { double sum, g[3]; double max, f3[3], *pdg = ma->pdg + k * 3; @@ -205,6 +208,7 @@ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) g[i] /= sum; if (g[i] > max) max = g[i], max_i = i; } + if ( !is_var ) { max_i = 2; max = g[2]; } // force 0/0 genotype if the site is non-variant max = 1. - max; if (max < 1e-308) max = 1e-308; q = (int)(-4.343 * log(max) + .499); diff --git a/bcftools/prob1.h b/bcftools/prob1.h index 1594d3f..a3d4b0d 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -78,7 +78,7 @@ extern "C" { void bcf_p1_destroy(bcf_p1aux_t *ma); void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma); int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); - int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); diff --git a/bcftools/regidx.c b/bcftools/regidx.c new file mode 100644 index 0000000..84646a8 --- /dev/null +++ b/bcftools/regidx.c @@ -0,0 +1,598 @@ +/* + Copyright (C) 2014-2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include "regidx.h" + +#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based + +#define iBIN(x) ((x)>>13) + +typedef struct +{ + uint32_t beg, end; +} +reg_t; + +typedef struct +{ + uint32_t pos, ireg; // y-coordinate and a pointer to reglist.reg and reglist.dat +} +pos_t; + +typedef struct _reglist_t reglist_t; + +typedef struct +{ + uint32_t beg, end, ireg; // query coordinates and the active region + regidx_t *ridx; + reglist_t *list; + int active; +} +_itr_t; + +// List of regions for one chromosome. +struct _reglist_t +{ + uint32_t *idx, nidx; // index to list.reg+1 + uint32_t nreg, mreg; // n:used, m:allocated + reg_t *reg; // regions + void *dat; // payload data + char *seq; // sequence name + int unsorted; + +}; + +// Container of all sequences +struct _regidx_t +{ + int nseq, mseq; // n:used, m:alloced + reglist_t *seq; // regions for each sequence + void *seq2regs; // hash for fast lookup from chr name to regions + char **seq_names; + regidx_free_f free; // function to free any data allocated by regidx_parse_f + regidx_parse_f parse; // parse one input line + void *usr; // user data to pass to regidx_parse_f + int payload_size; + void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand) + kstring_t str; +}; + +int regidx_seq_nregs(regidx_t *idx, const char *seq) +{ + int iseq; + if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence + return idx->seq[iseq].nreg; +} + +int regidx_nregs(regidx_t *idx) +{ + int i, nreg = 0; + for (i=0; inseq; i++) nreg += idx->seq[i].nreg; + return nreg; +} + +char **regidx_seq_names(regidx_t *idx, int *n) +{ + *n = idx->nseq; + return idx->seq_names; +} + +int regidx_insert_list(regidx_t *idx, char *line, char delim) +{ + kstring_t tmp = {0,0,0}; + char *ss = line; + while ( *ss ) + { + char *se = ss; + while ( *se && *se!=delim ) se++; + tmp.l = 0; + kputsn(ss, se-ss, &tmp); + if ( regidx_insert(idx,tmp.s) < 0 ) + { + free(tmp.s); + return -1; + } + if ( !*se ) break; + ss = se+1; + } + free(tmp.s); + return 0; +} + +static inline int cmp_regs(reg_t *a, reg_t *b) +{ + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + if ( a->end < b->end ) return 1; // longer intervals come first + if ( a->end > b->end ) return -1; + return 0; +} +static int cmp_reg_ptrs(const void *a, const void *b) +{ + return cmp_regs((reg_t*)a,(reg_t*)b); +} +static int cmp_reg_ptrs2(const void *a, const void *b) +{ + return cmp_regs(*((reg_t**)a),*((reg_t**)b)); +} + +inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload) +{ + if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; + if ( end > MAX_COOR_0 ) end = MAX_COOR_0; + + int rid; + idx->str.l = 0; + kputsn(chr_beg, chr_end-chr_beg+1, &idx->str); + if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) + { + // new chromosome + idx->nseq++; + int m_prev = idx->mseq; + hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); + hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); + idx->seq_names[idx->nseq-1] = strdup(idx->str.s); + rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); + } + + reglist_t *list = &idx->seq[rid]; + list->seq = idx->seq_names[rid]; + list->nreg++; + int mreg = list->mreg; + hts_expand(reg_t,list->nreg,list->mreg,list->reg); + list->reg[list->nreg-1].beg = beg; + list->reg[list->nreg-1].end = end; + if ( idx->payload_size ) + { + if ( mreg != list->mreg ) list->dat = realloc(list->dat,idx->payload_size*list->mreg); + memcpy((char *)list->dat + idx->payload_size*(list->nreg-1), payload, idx->payload_size); + } + if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1; + return 0; +} + +int regidx_insert(regidx_t *idx, char *line) +{ + if ( !line ) return 0; + char *chr_from, *chr_to; + uint32_t beg,end; + int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr); + if ( ret==-2 ) return -1; // error + if ( ret==-1 ) return 0; // skip the line + regidx_push(idx, chr_from,chr_to,beg,end,idx->payload); + return 0; +} + +regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) +{ + if ( !parser ) + { + if ( !fname ) parser = regidx_parse_tab; + else + { + int len = strlen(fname); + if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) ) + parser = regidx_parse_bed; + else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) ) + parser = regidx_parse_bed; + else if ( len>=4 && !strcasecmp(".bed",fname+len-4) ) + parser = regidx_parse_bed; + else + parser = regidx_parse_tab; + } + } + + regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); + idx->free = free_f; + idx->parse = parser; + idx->usr = usr_dat; + idx->seq2regs = khash_str2int_init(); + idx->payload_size = payload_size; + if ( payload_size ) idx->payload = malloc(payload_size); + + if ( !fname ) return idx; + + kstring_t str = {0,0,0}; + + htsFile *fp = hts_open(fname,"r"); + if ( !fp ) goto error; + + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + if ( regidx_insert(idx, str.s) ) goto error; + } + + free(str.s); + hts_close(fp); + return idx; + +error: + free(str.s); + if ( fp ) hts_close(fp); + regidx_destroy(idx); + return NULL; +} + +void regidx_destroy(regidx_t *idx) +{ + int i, j; + for (i=0; inseq; i++) + { + reglist_t *list = &idx->seq[i]; + if ( idx->free ) + { + for (j=0; jnreg; j++) + idx->free((char *)list->dat + idx->payload_size*j); + } + free(list->dat); + free(list->reg); + free(list->idx); + } + free(idx->seq_names); + free(idx->seq); + free(idx->str.s); + free(idx->payload); + khash_str2int_destroy_free(idx->seq2regs); + free(idx); +} + +int _reglist_build_index(regidx_t *regidx, reglist_t *list) +{ + int i; + if ( list->unsorted ) + { + if ( !regidx->payload_size ) + qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); + else + { + reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg); + for (i=0; inreg; i++) ptr[i] = list->reg + i; + qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); + + void *tmp_dat = malloc(regidx->payload_size*list->nreg); + for (i=0; inreg; i++) + { + size_t iori = ptr[i] - list->reg; + memcpy((char *)tmp_dat+i*regidx->payload_size, + (char *)list->dat+iori*regidx->payload_size, + regidx->payload_size); + } + free(list->dat); + list->dat = tmp_dat; + + reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); + for (i=0; inreg; i++) + { + size_t iori = ptr[i] - list->reg; + tmp_reg[i] = list->reg[iori]; + } + free(ptr); + free(list->reg); + list->reg = tmp_reg; + list->mreg = list->nreg; + } + list->unsorted = 0; + } + + list->nidx = 0; + int j,k, midx = 0; + for (j=0; jnreg; j++) + { + int ibeg = iBIN(list->reg[j].beg); + int iend = iBIN(list->reg[j].end); + if ( midx <= iend ) + { + int old_midx = midx; + midx = iend + 1; + kroundup32(midx); + list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t)); + memset(list->idx+old_midx, 0, sizeof(uint32_t)*(midx-old_midx)); + } + if ( ibeg==iend ) + { + if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1; + } + else + { + for (k=ibeg; k<=iend; k++) + if ( !list->idx[k] ) list->idx[k] = j + 1; + } + if ( list->nidx < iend+1 ) list->nidx = iend+1; + } + + return 0; +} + +int regidx_overlap(regidx_t *regidx, const char *chr, uint32_t beg, uint32_t end, regitr_t *regitr) +{ + if ( regitr ) regitr->seq = NULL; + + int iseq, ireg; + if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence + + reglist_t *list = ®idx->seq[iseq]; + if ( !list->nreg ) return 0; + + if ( list->nreg==1 ) + { + if ( beg > list->reg[0].end ) return 0; + if ( end < list->reg[0].beg ) return 0; + ireg = 0; + } + else + { + if ( !list->idx ) + _reglist_build_index(regidx,list); + + int ibeg = iBIN(beg); + if ( ibeg >= list->nidx ) return 0; // beg is too big + + // find a matching region + uint32_t i = list->idx[ibeg]; + if ( !i ) + { + int iend = iBIN(end); + if ( iend > list->nidx ) iend = list->nidx; + for (i=ibeg; iidx[i] ) break; + if ( i==iend ) return 0; + i = list->idx[i]; + } + + for (ireg=i-1; iregnreg; ireg++) + { + if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region + if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found + } + + if ( ireg >= list->nreg ) return 0; // no match + } + + if ( !regitr ) return 1; // match, but no more info to save + + // may need to iterate over the matching regions later + _itr_t *itr = (_itr_t*)regitr->itr; + itr->ridx = regidx; + itr->list = list; + itr->beg = beg; + itr->end = end; + itr->ireg = ireg; + itr->active = 0; + + regitr->seq = list->seq; + regitr->beg = list->reg[ireg].beg; + regitr->end = list->reg[ireg].end; + if ( regidx->payload_size ) + regitr->payload = (char *)list->dat + regidx->payload_size*ireg; + + return 1; +} + +int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && !isspace(*se) ) se++; + + *chr_beg = ss; + *chr_end = se-1; + + if ( !*se ) + { + // just the chromosome name + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + + ss = se+1; + *beg = strtod(ss, &se); + if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } + + ss = se+1; + *end = strtod(ss, &se) - 1; + if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } + + return 0; +} + +int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && !isspace(*se) ) se++; + + *chr_beg = ss; + *chr_end = se-1; + + if ( !*se ) + { + // just the chromosome name + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + + ss = se+1; + *beg = strtod(ss, &se); + if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; } + if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } + (*beg)--; + + if ( !se[0] || !se[1] ) + *end = *beg; + else + { + ss = se+1; + *end = strtod(ss, &se); + if ( ss==se ) *end = *beg; + else if ( *end==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } + else (*end)--; + } + return 0; +} + +int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && *se!=':' ) se++; + + *chr_beg = ss; + *chr_end = se-1; + + if ( !*se ) + { + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + + ss = se+1; + *beg = strtod(ss, &se); + if ( ss==se ) { fprintf(stderr,"Could not parse reg line: %s\n", line); return -2; } + if ( *beg==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } + (*beg)--; + + if ( !se[0] || !se[1] ) + *end = se[0]=='-' ? MAX_COOR_0 : *beg; + else + { + ss = se+1; + *end = strtod(ss, &se); + if ( ss==se ) *end = *beg; + else if ( *end==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } + else (*end)--; + } + return 0; +} + +regitr_t *regitr_init(regidx_t *regidx) +{ + regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t)); + regitr->itr = (_itr_t*) calloc(1,sizeof(_itr_t)); + _itr_t *itr = (_itr_t*) regitr->itr; + itr->ridx = regidx; + itr->list = NULL; + return regitr; +} + +void regitr_reset(regidx_t *regidx, regitr_t *regitr) +{ + _itr_t *itr = (_itr_t*) regitr->itr; + memset(itr,0,sizeof(_itr_t)); + itr->ridx = regidx; +} + +void regitr_destroy(regitr_t *regitr) +{ + free(regitr->itr); + free(regitr); +} + +int regitr_overlap(regitr_t *regitr) +{ + if ( !regitr->seq ) return 0; + + _itr_t *itr = (_itr_t*) regitr->itr; + if ( !itr->active ) + { + // is this the first call after regidx_overlap? + itr->active = 1; + itr->ireg++; + return 1; + } + + reglist_t *list = itr->list; + + int i; + for (i=itr->ireg; inreg; i++) + { + if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region + if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found + } + + if ( i >= list->nreg ) return 0; // no match + + itr->ireg = i + 1; + regitr->seq = list->seq; + regitr->beg = list->reg[i].beg; + regitr->end = list->reg[i].end; + if ( itr->ridx->payload_size ) + regitr->payload = (char *)list->dat + itr->ridx->payload_size*i; + + return 1; +} + +int regitr_loop(regitr_t *regitr) +{ + _itr_t *itr = (_itr_t*) regitr->itr; + regidx_t *regidx = itr->ridx; + + if ( !itr->list ) // first time here + { + itr->list = regidx->seq; + itr->ireg = 0; + } + + size_t iseq = itr->list - regidx->seq; + if ( iseq >= regidx->nseq ) return 0; + + if ( itr->ireg >= itr->list->nreg ) + { + iseq++; + if ( iseq >= regidx->nseq ) return 0; // no more sequences, done + itr->ireg = 0; + itr->list = ®idx->seq[iseq]; + } + + regitr->seq = itr->list->seq; + regitr->beg = itr->list->reg[itr->ireg].beg; + regitr->end = itr->list->reg[itr->ireg].end; + if ( regidx->payload_size ) + regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg; + itr->ireg++; + + return 1; +} + + + diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c new file mode 100644 index 0000000..4d6dcda --- /dev/null +++ b/bcftools/regidx.c.pysam.c @@ -0,0 +1,600 @@ +#include "pysam.h" + +/* + Copyright (C) 2014-2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include "regidx.h" + +#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based + +#define iBIN(x) ((x)>>13) + +typedef struct +{ + uint32_t beg, end; +} +reg_t; + +typedef struct +{ + uint32_t pos, ireg; // y-coordinate and a pointer to reglist.reg and reglist.dat +} +pos_t; + +typedef struct _reglist_t reglist_t; + +typedef struct +{ + uint32_t beg, end, ireg; // query coordinates and the active region + regidx_t *ridx; + reglist_t *list; + int active; +} +_itr_t; + +// List of regions for one chromosome. +struct _reglist_t +{ + uint32_t *idx, nidx; // index to list.reg+1 + uint32_t nreg, mreg; // n:used, m:allocated + reg_t *reg; // regions + void *dat; // payload data + char *seq; // sequence name + int unsorted; + +}; + +// Container of all sequences +struct _regidx_t +{ + int nseq, mseq; // n:used, m:alloced + reglist_t *seq; // regions for each sequence + void *seq2regs; // hash for fast lookup from chr name to regions + char **seq_names; + regidx_free_f free; // function to free any data allocated by regidx_parse_f + regidx_parse_f parse; // parse one input line + void *usr; // user data to pass to regidx_parse_f + int payload_size; + void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand) + kstring_t str; +}; + +int regidx_seq_nregs(regidx_t *idx, const char *seq) +{ + int iseq; + if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence + return idx->seq[iseq].nreg; +} + +int regidx_nregs(regidx_t *idx) +{ + int i, nreg = 0; + for (i=0; inseq; i++) nreg += idx->seq[i].nreg; + return nreg; +} + +char **regidx_seq_names(regidx_t *idx, int *n) +{ + *n = idx->nseq; + return idx->seq_names; +} + +int regidx_insert_list(regidx_t *idx, char *line, char delim) +{ + kstring_t tmp = {0,0,0}; + char *ss = line; + while ( *ss ) + { + char *se = ss; + while ( *se && *se!=delim ) se++; + tmp.l = 0; + kputsn(ss, se-ss, &tmp); + if ( regidx_insert(idx,tmp.s) < 0 ) + { + free(tmp.s); + return -1; + } + if ( !*se ) break; + ss = se+1; + } + free(tmp.s); + return 0; +} + +static inline int cmp_regs(reg_t *a, reg_t *b) +{ + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + if ( a->end < b->end ) return 1; // longer intervals come first + if ( a->end > b->end ) return -1; + return 0; +} +static int cmp_reg_ptrs(const void *a, const void *b) +{ + return cmp_regs((reg_t*)a,(reg_t*)b); +} +static int cmp_reg_ptrs2(const void *a, const void *b) +{ + return cmp_regs(*((reg_t**)a),*((reg_t**)b)); +} + +inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload) +{ + if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; + if ( end > MAX_COOR_0 ) end = MAX_COOR_0; + + int rid; + idx->str.l = 0; + kputsn(chr_beg, chr_end-chr_beg+1, &idx->str); + if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) + { + // new chromosome + idx->nseq++; + int m_prev = idx->mseq; + hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); + hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); + idx->seq_names[idx->nseq-1] = strdup(idx->str.s); + rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); + } + + reglist_t *list = &idx->seq[rid]; + list->seq = idx->seq_names[rid]; + list->nreg++; + int mreg = list->mreg; + hts_expand(reg_t,list->nreg,list->mreg,list->reg); + list->reg[list->nreg-1].beg = beg; + list->reg[list->nreg-1].end = end; + if ( idx->payload_size ) + { + if ( mreg != list->mreg ) list->dat = realloc(list->dat,idx->payload_size*list->mreg); + memcpy((char *)list->dat + idx->payload_size*(list->nreg-1), payload, idx->payload_size); + } + if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1; + return 0; +} + +int regidx_insert(regidx_t *idx, char *line) +{ + if ( !line ) return 0; + char *chr_from, *chr_to; + uint32_t beg,end; + int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr); + if ( ret==-2 ) return -1; // error + if ( ret==-1 ) return 0; // skip the line + regidx_push(idx, chr_from,chr_to,beg,end,idx->payload); + return 0; +} + +regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) +{ + if ( !parser ) + { + if ( !fname ) parser = regidx_parse_tab; + else + { + int len = strlen(fname); + if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) ) + parser = regidx_parse_bed; + else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) ) + parser = regidx_parse_bed; + else if ( len>=4 && !strcasecmp(".bed",fname+len-4) ) + parser = regidx_parse_bed; + else + parser = regidx_parse_tab; + } + } + + regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); + idx->free = free_f; + idx->parse = parser; + idx->usr = usr_dat; + idx->seq2regs = khash_str2int_init(); + idx->payload_size = payload_size; + if ( payload_size ) idx->payload = malloc(payload_size); + + if ( !fname ) return idx; + + kstring_t str = {0,0,0}; + + htsFile *fp = hts_open(fname,"r"); + if ( !fp ) goto error; + + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + if ( regidx_insert(idx, str.s) ) goto error; + } + + free(str.s); + hts_close(fp); + return idx; + +error: + free(str.s); + if ( fp ) hts_close(fp); + regidx_destroy(idx); + return NULL; +} + +void regidx_destroy(regidx_t *idx) +{ + int i, j; + for (i=0; inseq; i++) + { + reglist_t *list = &idx->seq[i]; + if ( idx->free ) + { + for (j=0; jnreg; j++) + idx->free((char *)list->dat + idx->payload_size*j); + } + free(list->dat); + free(list->reg); + free(list->idx); + } + free(idx->seq_names); + free(idx->seq); + free(idx->str.s); + free(idx->payload); + khash_str2int_destroy_free(idx->seq2regs); + free(idx); +} + +int _reglist_build_index(regidx_t *regidx, reglist_t *list) +{ + int i; + if ( list->unsorted ) + { + if ( !regidx->payload_size ) + qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); + else + { + reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg); + for (i=0; inreg; i++) ptr[i] = list->reg + i; + qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); + + void *tmp_dat = malloc(regidx->payload_size*list->nreg); + for (i=0; inreg; i++) + { + size_t iori = ptr[i] - list->reg; + memcpy((char *)tmp_dat+i*regidx->payload_size, + (char *)list->dat+iori*regidx->payload_size, + regidx->payload_size); + } + free(list->dat); + list->dat = tmp_dat; + + reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); + for (i=0; inreg; i++) + { + size_t iori = ptr[i] - list->reg; + tmp_reg[i] = list->reg[iori]; + } + free(ptr); + free(list->reg); + list->reg = tmp_reg; + list->mreg = list->nreg; + } + list->unsorted = 0; + } + + list->nidx = 0; + int j,k, midx = 0; + for (j=0; jnreg; j++) + { + int ibeg = iBIN(list->reg[j].beg); + int iend = iBIN(list->reg[j].end); + if ( midx <= iend ) + { + int old_midx = midx; + midx = iend + 1; + kroundup32(midx); + list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t)); + memset(list->idx+old_midx, 0, sizeof(uint32_t)*(midx-old_midx)); + } + if ( ibeg==iend ) + { + if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1; + } + else + { + for (k=ibeg; k<=iend; k++) + if ( !list->idx[k] ) list->idx[k] = j + 1; + } + if ( list->nidx < iend+1 ) list->nidx = iend+1; + } + + return 0; +} + +int regidx_overlap(regidx_t *regidx, const char *chr, uint32_t beg, uint32_t end, regitr_t *regitr) +{ + if ( regitr ) regitr->seq = NULL; + + int iseq, ireg; + if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence + + reglist_t *list = ®idx->seq[iseq]; + if ( !list->nreg ) return 0; + + if ( list->nreg==1 ) + { + if ( beg > list->reg[0].end ) return 0; + if ( end < list->reg[0].beg ) return 0; + ireg = 0; + } + else + { + if ( !list->idx ) + _reglist_build_index(regidx,list); + + int ibeg = iBIN(beg); + if ( ibeg >= list->nidx ) return 0; // beg is too big + + // find a matching region + uint32_t i = list->idx[ibeg]; + if ( !i ) + { + int iend = iBIN(end); + if ( iend > list->nidx ) iend = list->nidx; + for (i=ibeg; iidx[i] ) break; + if ( i==iend ) return 0; + i = list->idx[i]; + } + + for (ireg=i-1; iregnreg; ireg++) + { + if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region + if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found + } + + if ( ireg >= list->nreg ) return 0; // no match + } + + if ( !regitr ) return 1; // match, but no more info to save + + // may need to iterate over the matching regions later + _itr_t *itr = (_itr_t*)regitr->itr; + itr->ridx = regidx; + itr->list = list; + itr->beg = beg; + itr->end = end; + itr->ireg = ireg; + itr->active = 0; + + regitr->seq = list->seq; + regitr->beg = list->reg[ireg].beg; + regitr->end = list->reg[ireg].end; + if ( regidx->payload_size ) + regitr->payload = (char *)list->dat + regidx->payload_size*ireg; + + return 1; +} + +int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && !isspace(*se) ) se++; + + *chr_beg = ss; + *chr_end = se-1; + + if ( !*se ) + { + // just the chromosome name + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + + ss = se+1; + *beg = strtod(ss, &se); + if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; } + + ss = se+1; + *end = strtod(ss, &se) - 1; + if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; } + + return 0; +} + +int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && !isspace(*se) ) se++; + + *chr_beg = ss; + *chr_end = se-1; + + if ( !*se ) + { + // just the chromosome name + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + + ss = se+1; + *beg = strtod(ss, &se); + if ( ss==se ) { fprintf(pysam_stderr,"Could not parse tab line: %s\n", line); return -2; } + if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } + (*beg)--; + + if ( !se[0] || !se[1] ) + *end = *beg; + else + { + ss = se+1; + *end = strtod(ss, &se); + if ( ss==se ) *end = *beg; + else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } + else (*end)--; + } + return 0; +} + +int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && *se!=':' ) se++; + + *chr_beg = ss; + *chr_end = se-1; + + if ( !*se ) + { + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + + ss = se+1; + *beg = strtod(ss, &se); + if ( ss==se ) { fprintf(pysam_stderr,"Could not parse reg line: %s\n", line); return -2; } + if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } + (*beg)--; + + if ( !se[0] || !se[1] ) + *end = se[0]=='-' ? MAX_COOR_0 : *beg; + else + { + ss = se+1; + *end = strtod(ss, &se); + if ( ss==se ) *end = *beg; + else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } + else (*end)--; + } + return 0; +} + +regitr_t *regitr_init(regidx_t *regidx) +{ + regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t)); + regitr->itr = (_itr_t*) calloc(1,sizeof(_itr_t)); + _itr_t *itr = (_itr_t*) regitr->itr; + itr->ridx = regidx; + itr->list = NULL; + return regitr; +} + +void regitr_reset(regidx_t *regidx, regitr_t *regitr) +{ + _itr_t *itr = (_itr_t*) regitr->itr; + memset(itr,0,sizeof(_itr_t)); + itr->ridx = regidx; +} + +void regitr_destroy(regitr_t *regitr) +{ + free(regitr->itr); + free(regitr); +} + +int regitr_overlap(regitr_t *regitr) +{ + if ( !regitr->seq ) return 0; + + _itr_t *itr = (_itr_t*) regitr->itr; + if ( !itr->active ) + { + // is this the first call after regidx_overlap? + itr->active = 1; + itr->ireg++; + return 1; + } + + reglist_t *list = itr->list; + + int i; + for (i=itr->ireg; inreg; i++) + { + if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region + if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found + } + + if ( i >= list->nreg ) return 0; // no match + + itr->ireg = i + 1; + regitr->seq = list->seq; + regitr->beg = list->reg[i].beg; + regitr->end = list->reg[i].end; + if ( itr->ridx->payload_size ) + regitr->payload = (char *)list->dat + itr->ridx->payload_size*i; + + return 1; +} + +int regitr_loop(regitr_t *regitr) +{ + _itr_t *itr = (_itr_t*) regitr->itr; + regidx_t *regidx = itr->ridx; + + if ( !itr->list ) // first time here + { + itr->list = regidx->seq; + itr->ireg = 0; + } + + size_t iseq = itr->list - regidx->seq; + if ( iseq >= regidx->nseq ) return 0; + + if ( itr->ireg >= itr->list->nreg ) + { + iseq++; + if ( iseq >= regidx->nseq ) return 0; // no more sequences, done + itr->ireg = 0; + itr->list = ®idx->seq[iseq]; + } + + regitr->seq = itr->list->seq; + regitr->beg = itr->list->reg[itr->ireg].beg; + regitr->end = itr->list->reg[itr->ireg].end; + if ( regidx->payload_size ) + regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg; + itr->ireg++; + + return 1; +} + + + diff --git a/bcftools/regidx.h b/bcftools/regidx.h new file mode 100644 index 0000000..8e25fe1 --- /dev/null +++ b/bcftools/regidx.h @@ -0,0 +1,191 @@ +/* + Copyright (C) 2014-2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + Region indexing with an optional payload. + + Example of usage: + + // Init the parser and print regions. In this example the payload is a + // pointer to a string. For the description of parse_custom and + // free_custom functions, see regidx_parse_f and regidx_free_f below, + // and for working example see test/test-regidx.c. + regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); + + // Query overlap with chr:from-to + regitr_t *itr = regitr_init(idx); + if ( regidx_overlap(idx, chr,from,to, itr) ) printf("There is an overlap!\n"); + + while ( regitr_overlap(itr) ) + { + printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to, + itr->beg, itr->end, regitr_payload(itr,char*)); + } + + regidx_destroy(idx); + regitr_destroy(itr); + + + Another example, loop over all regions: + + regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL); + regitr_t *itr = regitr_init(idx); + + while ( regitr_loop(itr) ) + printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg, itr->end); + + regidx_destroy(idx); + regitr_destroy(itr); +*/ + +#ifndef __REGIDX_H__ +#define __REGIDX_H__ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define REGIDX_MAX 2147483646 // maximum regidx coordinate (0-based) + +typedef struct _regidx_t regidx_t; +typedef struct +{ + uint32_t beg,end; + void *payload; + char *seq; + void *itr; +} +regitr_t; + +#define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload)) + +/* + * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed + * or regidx_parse_tab below. The function is expected to set `chr_from` and + * `chr_to` to point to first and last character of chromosome name and set + * coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was + * called with non-zero payload_size, the `payload` points to a memory + * location of the payload_size and `usr` is the data passed to regidx_init(). + * Any memory allocated by the function will be freed by regidx_free_f called + * by regidx_destroy(). + * + * Return value: 0 on success, -1 to skip a record, -2 on fatal error. + */ +typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr); +typedef void (*regidx_free_f)(void *payload); + +/* + * A note about the parsers: + * - leading spaces are ignored + * - lines starting with "#" are ignored + */ +int regidx_parse_bed(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-sepatated CHROM,FROM,TO (0-based,right-open) +int regidx_parse_tab(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-separated CHROM,POS (1-based, inclusive) +int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive) + +/* + * regidx_init() - creates new index + * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert() + * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL, + * the format will be autodected, currently either regidx_parse_tab (the default) or + * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that + * the exact autodetection algorithm will change. + * @param freef: NULL or see description of regidx_parse_f + * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f + * @param usr: optional user data passed to regidx_parse_f + * + * Returns index on success or NULL on error. + */ +regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); + +/* + * regidx_destroy() - free memory allocated by regidx_init + */ +void regidx_destroy(regidx_t *idx); + +/* + * regidx_overlap() - check overlap of the location chr:from-to with regions + * @param beg,end: 0-based start, end coordinate (inclusive) + * @param itr: pointer to iterator, can be NULL if regidx_loop not needed + * + * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping + * regions can be iterated as shown in the example above. + */ +int regidx_overlap(regidx_t *idx, const char *chr, uint32_t beg, uint32_t end, regitr_t *itr); + +/* + * regidx_insert() - add a new region. + * regidx_insert_list() - add new regions from a list + * regidx_push() - low level insertion of a new region + * + * Returns 0 on success or -1 on error. + */ +int regidx_insert(regidx_t *idx, char *line); +int regidx_insert_list(regidx_t *idx, char *line, char delim); +int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload); + +/* + * regidx_seq_names() - return list of all sequence names + */ +char **regidx_seq_names(regidx_t *idx, int *n); + +/* + * regidx_seq_nregs() - number of regions + * regidx_nregs() - total number of regions + */ +int regidx_seq_nregs(regidx_t *idx, const char *seq); +int regidx_nregs(regidx_t *idx); + +/* + * regitr_init() - initialize an iterator. The idx parameter is required only + * with regitr_loop. If only regitr_overlap is called, NULL + * can be given. + * + * regitr_reset() - initialize an iterator for a repeated regitr_loop cycle. + * Not required with regitr_overlap. + */ +regitr_t *regitr_init(regidx_t *idx); +void regitr_destroy(regitr_t *itr); +void regitr_reset(regidx_t *idx, regitr_t *itr); + +/* + * regitr_overlap() - next overlapping region + * Returns 0 when done or 1 when itr is set to next region + */ +int regitr_overlap(regitr_t *itr); + +/* + * regitr_loop() - loop over all regions + * Returns 0 when done or 1 when itr is set to next region + */ +int regitr_loop(regitr_t *itr); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bcftools/smpl_ilist.c b/bcftools/smpl_ilist.c new file mode 100644 index 0000000..c7fa913 --- /dev/null +++ b/bcftools/smpl_ilist.c @@ -0,0 +1,106 @@ +/* + Copyright (C) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "bcftools.h" +#include "smpl_ilist.h" + +void smpl_ilist_destroy(smpl_ilist_t *smpl) +{ + free(smpl->idx); + free(smpl); +} + +smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags) +{ + smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t)); + + int i; + if ( !sample_list ) + { + smpl->n = bcf_hdr_nsamples(hdr); + smpl->idx = (int*) malloc(sizeof(int)*smpl->n); + for (i=0; in; i++) smpl->idx[i] = i; + return smpl; + } + + int nlist; + char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist); + if ( !list ) error("Could not parse %s\n", sample_list); + + // preserve the VCF order + int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); + for (i=0; i=0 ) + { + tmp[idx] = 1; + smpl->n++; + } + else if ( flags&SMPL_STRICT ) + error("No such sample: %s\n", list[i]); + } + + if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; + smpl->idx = (int*) malloc(sizeof(int)*smpl->n); + + int j = 0; + if ( sample_list[0]!='^' ) + { + for (i=0; iidx[j++] = i; + } + else + { + for (i=0; iidx[j++] = i; + } + + free(tmp); + for (i=0; in = bcf_hdr_nsamples(hdr_a); + smpl->idx = (int*) malloc(sizeof(int)*smpl->n); + for (i=0; in; i++) + { + const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i); + smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name); + if ( flags&SMPL_STRICT && smpl->idx[i]<0 ) + error("The sample %s is not present in the second file\n", name); + } + return smpl; +} + diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c new file mode 100644 index 0000000..f52b8ce --- /dev/null +++ b/bcftools/smpl_ilist.c.pysam.c @@ -0,0 +1,108 @@ +#include "pysam.h" + +/* + Copyright (C) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "bcftools.h" +#include "smpl_ilist.h" + +void smpl_ilist_destroy(smpl_ilist_t *smpl) +{ + free(smpl->idx); + free(smpl); +} + +smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags) +{ + smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t)); + + int i; + if ( !sample_list ) + { + smpl->n = bcf_hdr_nsamples(hdr); + smpl->idx = (int*) malloc(sizeof(int)*smpl->n); + for (i=0; in; i++) smpl->idx[i] = i; + return smpl; + } + + int nlist; + char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist); + if ( !list ) error("Could not parse %s\n", sample_list); + + // preserve the VCF order + int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); + for (i=0; i=0 ) + { + tmp[idx] = 1; + smpl->n++; + } + else if ( flags&SMPL_STRICT ) + error("No such sample: %s\n", list[i]); + } + + if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; + smpl->idx = (int*) malloc(sizeof(int)*smpl->n); + + int j = 0; + if ( sample_list[0]!='^' ) + { + for (i=0; iidx[j++] = i; + } + else + { + for (i=0; iidx[j++] = i; + } + + free(tmp); + for (i=0; in = bcf_hdr_nsamples(hdr_a); + smpl->idx = (int*) malloc(sizeof(int)*smpl->n); + for (i=0; in; i++) + { + const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i); + smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name); + if ( flags&SMPL_STRICT && smpl->idx[i]<0 ) + error("The sample %s is not present in the second file\n", name); + } + return smpl; +} + diff --git a/bcftools/smpl_ilist.h b/bcftools/smpl_ilist.h new file mode 100644 index 0000000..7083cf2 --- /dev/null +++ b/bcftools/smpl_ilist.h @@ -0,0 +1,47 @@ +/* + Copyright (C) 2016 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ +/* + Parse --samples and --samples-file +*/ + +#ifndef __SMPL_ILIST_H__ +#define __SMPL_ILIST_H__ + +#include + +#define SMPL_NONE 0 // flexible error recovery +#define SMPL_STRICT 1 // samples must exist + +typedef struct +{ + int *idx; // index to bcf_hdr_t.samples + int n; +} +smpl_ilist_t; + +smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags); +smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags); +void smpl_ilist_destroy(smpl_ilist_t *smpl); + +#endif diff --git a/bcftools/tabix.c b/bcftools/tabix.c index 2f24b92..c1874c2 100644 --- a/bcftools/tabix.c +++ b/bcftools/tabix.c @@ -1,7 +1,7 @@ /* tabix.c -- tabix subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013, 2016 Genome Research Ltd. Author: Heng Li @@ -32,8 +32,8 @@ THE SOFTWARE. */ int main_tabix(int argc, char *argv[]) { - int c, min_shift = -1, is_force = 0, is_all = 0; - tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; + int c, min_shift = -1, is_force = 0, is_all = 0, detect = 1; + tbx_conf_t conf = tbx_conf_gff; while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) if (c == '0') conf.preset |= TBX_UCSC; else if (c == 'f') is_force = 1; @@ -45,13 +45,14 @@ int main_tabix(int argc, char *argv[]) else if (c == 'c') conf.meta_char = *optarg; else if (c == 'S') conf.line_skip = atoi(optarg); else if (c == 'p') { - if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; - else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; - else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; - else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; + if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff; + else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed; + else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam; + else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf; else { fprintf(stderr, "The type '%s' not recognised\n", optarg); return 1; + detect = 0; } } @@ -79,28 +80,29 @@ int main_tabix(int argc, char *argv[]) bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // create index - if ( !conf_ptr ) + if ( detect ) { // auto-detect file type by file name int l = strlen(argv[optind]); int strcasecmp(const char *s1, const char *s2); - if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf; + if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff; + else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed; + else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam; + else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf = tbx_conf_vcf; } - if ( conf_ptr ) conf = *conf_ptr; if (!is_force) { char *fn; FILE *fp; - fn = (char*)alloca(strlen(argv[optind]) + 5); + fn = (char*)malloc(strlen(argv[optind]) + 5); strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); + free(fn); fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } + free(fn); } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { diff --git a/bcftools/tabix.c.pysam.c b/bcftools/tabix.c.pysam.c index afa3619..b0c6e0e 100644 --- a/bcftools/tabix.c.pysam.c +++ b/bcftools/tabix.c.pysam.c @@ -3,7 +3,7 @@ /* tabix.c -- tabix subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013, 2016 Genome Research Ltd. Author: Heng Li @@ -34,8 +34,8 @@ THE SOFTWARE. */ int main_tabix(int argc, char *argv[]) { - int c, min_shift = -1, is_force = 0, is_all = 0; - tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; + int c, min_shift = -1, is_force = 0, is_all = 0, detect = 1; + tbx_conf_t conf = tbx_conf_gff; while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) if (c == '0') conf.preset |= TBX_UCSC; else if (c == 'f') is_force = 1; @@ -47,13 +47,14 @@ int main_tabix(int argc, char *argv[]) else if (c == 'c') conf.meta_char = *optarg; else if (c == 'S') conf.line_skip = atoi(optarg); else if (c == 'p') { - if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; - else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; - else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; - else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; + if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff; + else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed; + else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam; + else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf; else { fprintf(pysam_stderr, "The type '%s' not recognised\n", optarg); return 1; + detect = 0; } } @@ -81,28 +82,29 @@ int main_tabix(int argc, char *argv[]) bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // create index - if ( !conf_ptr ) + if ( detect ) { // auto-detect file type by file name int l = strlen(argv[optind]); int strcasecmp(const char *s1, const char *s2); - if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf; + if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff; + else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed; + else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam; + else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf = tbx_conf_vcf; } - if ( conf_ptr ) conf = *conf_ptr; if (!is_force) { char *fn; FILE *fp; - fn = (char*)alloca(strlen(argv[optind]) + 5); + fn = (char*)malloc(strlen(argv[optind]) + 5); strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); + free(fn); fprintf(pysam_stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } + free(fn); } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { diff --git a/bcftools/tsv2vcf.c b/bcftools/tsv2vcf.c index 8826f18..2e1aa52 100644 --- a/bcftools/tsv2vcf.c +++ b/bcftools/tsv2vcf.c @@ -24,6 +24,7 @@ */ #include +#include #include "tsv2vcf.h" tsv_t *tsv_init(const char *str) diff --git a/bcftools/tsv2vcf.c.pysam.c b/bcftools/tsv2vcf.c.pysam.c index 1da48d5..f5eff01 100644 --- a/bcftools/tsv2vcf.c.pysam.c +++ b/bcftools/tsv2vcf.c.pysam.c @@ -26,6 +26,7 @@ */ #include +#include #include "tsv2vcf.h" tsv_t *tsv_init(const char *str) diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index d5164f3..e6efda9 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -41,6 +42,7 @@ THE SOFTWARE. */ #include "vcmp.h" #include "filter.h" #include "convert.h" +#include "smpl_ilist.h" struct _args_t; @@ -65,12 +67,12 @@ annot_line_t; #define REPLACE_MISSING 0 // replace only missing values #define REPLACE_ALL 1 // replace both missing and existing values -#define REPLACE_EXISTING 2 // replace only if tgt is not missing +#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing #define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise typedef struct _annot_col_t { int icol, replace, number; // number: one of BCF_VL_* types - char *hdr_key; + char *hdr_key_src, *hdr_key_dst; int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); } annot_col_t; @@ -109,6 +111,7 @@ typedef struct _args_t convert_t *set_ids; int set_ids_replace; + int nsmpl_annot; int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc int mtmpi, mtmpf, mtmps; int mtmpi2, mtmpf2, mtmps2; @@ -155,6 +158,7 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag) } line->d.shared_dirty |= BCF1_DIRTY_INF; inf->vptr = NULL; + inf->vptr_off = inf->vptr_len = 0; } } void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag) @@ -187,6 +191,10 @@ void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag) } } +#include "htslib/khash.h" +KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) +typedef khash_t(vdict) vdict_t; + static void remove_hdr_lines(bcf_hdr_t *hdr, int type) { int i = 0, nrm = 0; @@ -194,11 +202,18 @@ static void remove_hdr_lines(bcf_hdr_t *hdr, int type) { if ( hdr->hrec[i]->type!=type ) { i++; continue; } bcf_hrec_t *hrec = hdr->hrec[i]; - if ( type==BCF_HL_FMT ) + if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) { // everything except FORMAT/GT int id = bcf_hrec_find_key(hrec, "ID"); - if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; } + if ( id>=0 ) + { + if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; } + vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; + khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]); + kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; + kh_val(d, k).info[type] |= 0xf; + } } nrm++; hdr->nhrec--; @@ -453,7 +468,7 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) line->qual = strtod(str, &str); if ( str == tab->cols[col->icol] ) - error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); return 0; } static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -470,31 +485,31 @@ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void * char *str = tab->cols[col->icol]; if ( str[0]=='.' && str[1]==0 ) return 0; - if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1); - if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0); + if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); + if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); return -1; } static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL); - bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag); + int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL); + bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag); return 0; } static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) { if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); if ( !map ) error("REF alleles not compatible at %s:%d\n"); // fill in any missing values in the target VCF (or all, if not present) - int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2); + int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2); int i; @@ -511,7 +526,7 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpi2[i] = args->tmpi[ map[i] ]; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst); + bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); return 0; } static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -537,17 +552,17 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2); + int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi); + bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); return 0; } static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi); + int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); if ( ntmpi < 0 ) return 0; // nothing to add if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) @@ -555,26 +570,26 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2); + int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi); + bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); return 0; } static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) { if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); if ( !map ) error("REF alleles not compatible at %s:%d\n"); // fill in any missing values in the target VCF (or all, if not present) - int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2); + int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2); int i; @@ -591,7 +606,7 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpf2[i] = args->tmpf[ map[i] ]; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst); + bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); return 0; } static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -617,17 +632,17 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2); + int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf); + bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); return 0; } static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf); + int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); if ( ntmpf < 0 ) return 0; // nothing to add if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) @@ -635,11 +650,11 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2); + int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf); + bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); return 0; } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c @@ -652,9 +667,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in lsrc++; } if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); @@ -662,7 +677,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in // fill in any missing values in the target VCF (or all, if not present) int i, empty = 0, nstr, mstr = args->tmpks.m; - nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr); + nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); args->tmpks.m = mstr; if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) ) { @@ -695,7 +710,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i); assert( ret==0 ); } - bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s); + bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); return 0; } static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -712,17 +727,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2); + int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; } - bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps); + bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); return 0; } static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps); + int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps); if ( ntmps < 0 ) return 0; // nothing to add if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) @@ -730,11 +745,11 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2); + int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; } - bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps); + bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); return 0; } static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -752,7 +767,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) // field not present in dst file { - if ( col->replace==REPLACE_EXISTING ) return 0; + if ( col->replace==REPLACE_NON_MISSING ) return 0; hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { @@ -777,7 +792,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( args->sample_map[i]==-1 ) continue; int32_t *src = args->tmpi + nsrc*args->sample_map[i]; int32_t *dst = args->tmpi2 + ndst*i; - if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue; + if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue; if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue; for (j=0; jtmpi3 + nsrc*i; int keep_ori = 0; if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; + else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1; if ( keep_ori ) { @@ -811,7 +826,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo } static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) { - int i, nmax = 0; + int i, nmax = 1; for (i=icol_beg; icols[i], *end = str; @@ -831,298 +846,306 @@ static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) } return nmax; } -static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -{ - annot_line_t *tab = (annot_line_t*) data; - int nsmpl = bcf_hdr_nsamples(args->hdr_out); - assert( col->icol+nsmpl <= tab->ncols ); - int nvals = count_vals(tab,col->icol,col->icol+nsmpl); - assert( nvals>0 ); - hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi); - - int icol = col->icol, ismpl; - for (ismpl=0; ismpltmpi + ismpl*nvals; - int ival = 0; - - char *str = tab->cols[icol]; - while ( *str ) - { - if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value - { - ptr[ival++] = bcf_int32_missing; - str += str[1] ? 2 : 1; - continue; - } - - char *end = str; - ptr[ival] = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? end+1 : end; - } - while ( ivalhdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals); -} -static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -{ - annot_line_t *tab = (annot_line_t*) data; - int nsmpl = bcf_hdr_nsamples(args->hdr_out); - assert( col->icol+nsmpl <= tab->ncols ); - int nvals = count_vals(tab,col->icol,col->icol+nsmpl); - assert( nvals>0 ); - hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf); - - int icol = col->icol, ismpl; - for (ismpl=0; ismpltmpf + ismpl*nvals; - int ival = 0; - - char *str = tab->cols[icol]; - while ( *str ) - { - if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value - { - bcf_float_set_missing(ptr[ival]); - ival++; - str += str[1] ? 2 : 1; - continue; - } - - char *end = str; - ptr[ival] = strtod(str, &end); - if ( end==str ) - error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? end+1 : end; - } - while ( ivalhdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals); -} -static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -{ - annot_line_t *tab = (annot_line_t*) data; - int nsmpl = bcf_hdr_nsamples(args->hdr_out); - assert( col->icol+nsmpl <= tab->ncols ); - - int i, max_len = 0; - for (i=col->icol; iicol+nsmpl; i++) - { - int len = strlen(tab->cols[i]); - if ( max_len < len ) max_len = len; - } - hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps); - - int icol = col->icol, ismpl; - for (ismpl=0; ismpltmps + ismpl*max_len; - char *str = tab->cols[icol]; - i = 0; - while ( str[i] ) - { - ptr[i] = str[i]; - i++; - } - while ( ihdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len); -} -static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals) { - bcf1_t *rec = (bcf1_t*) data; - int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi); - if ( nsrc==-3 ) return 0; // the tag is not present - if ( nsrc<=0 ) return 1; // error - if ( !args->sample_map ) - return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot); - int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2); + int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2); if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); - nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) { - if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present - hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); + if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { - int32_t *dst = args->tmpi2 + nsrc*i; + int32_t *dst = args->tmpi2 + nvals*i; if ( args->sample_map[i]==-1 ) { dst[0] = bcf_int32_missing; - for (j=1; jtmpi + nsrc*args->sample_map[i]; - for (j=0; jsample_map[i]; + for (j=0; jhdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nsrc ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { if ( args->sample_map[i]==-1 ) continue; - int32_t *src = args->tmpi + nsrc*args->sample_map[i]; + int32_t *src = vals + nvals*args->sample_map[i]; int32_t *dst = args->tmpi2 + ndst*i; - if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue; - if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue; - for (j=0; jreplace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } + else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } + else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } + for (j=0; jhdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out)); } - else // ndst < nsrc + else // ndst < nvals { - hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3); + hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3); for (i=0; ihdr_out); i++) { - int32_t *ori = args->tmpi2 + ndst*i; - int32_t *dst = args->tmpi3 + nsrc*i; - int keep_ori = 0; - if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1; - else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1; - if ( keep_ori ) + int32_t *ann = vals + nvals*args->sample_map[i]; + int32_t *ori = args->tmpi2 + ndst*i; // ori vcf line + int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer + int use_new_ann = 1; + if ( args->sample_map[i]==-1 ) use_new_ann = 0; + else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; } + if ( !use_new_ann ) { for (j=0; jtmpi + nsrc*args->sample_map[i]; - for (j=0; jhdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out)); } } -static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals) { - bcf1_t *rec = (bcf1_t*) data; - int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf); - if ( nsrc==-3 ) return 0; // the tag is not present - if ( nsrc<=0 ) return 1; // error - if ( !args->sample_map ) - return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot); - int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2); + int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2); if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); - nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) { - if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present - hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2); + if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2); for (i=0; ihdr_out); i++) { - float *dst = args->tmpf2 + nsrc*i; + float *dst = args->tmpf2 + nvals*i; if ( args->sample_map[i]==-1 ) { bcf_float_set_missing(dst[0]); - for (j=1; jtmpf + nsrc*args->sample_map[i]; - for (j=0; jsample_map[i]; + for (j=0; jhdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nsrc ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { if ( args->sample_map[i]==-1 ) continue; - float *src = args->tmpf + nsrc*args->sample_map[i]; + float *src = vals + nvals*args->sample_map[i]; float *dst = args->tmpf2 + ndst*i; - if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue; - if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue; - for (j=0; jreplace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } + for (j=0; jhdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out)); } - else // ndst < nsrc + else // ndst < nvals { - hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3); + hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3); for (i=0; ihdr_out); i++) { - float *ori = args->tmpf2 + ndst*i; - float *dst = args->tmpf3 + nsrc*i; - int keep_ori = 0; - if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1; - else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1; - if ( keep_ori ) + float *ann = vals + nvals*args->sample_map[i]; + float *ori = args->tmpf2 + ndst*i; // ori vcf line + float *dst = args->tmpf3 + nvals*i; // expanded buffer + int use_new_ann = 1; + if ( args->sample_map[i]==-1 ) use_new_ann = 0; + else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; } + if ( !use_new_ann ) { for (j=0; jtmpf + nsrc*args->sample_map[i]; - for (j=0; jhdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out)); } } -static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals) { - bcf1_t *rec = (bcf1_t*) data; - args->tmpp[0] = args->tmps; - int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps); - args->tmps = args->tmpp[0]; // tmps might be realloced - if ( ret==-3 ) return 0; // the tag is not present - if ( ret<=0 ) return 1; // error - if ( !args->sample_map ) - return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot); int i; args->tmpp2[0] = args->tmps2; - ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2); + int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2); args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced + int nsmpl = bcf_hdr_nsamples(args->hdr_out); if ( ret<=0 ) // not present in dst { hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2); - for (i=0; ihdr_out); i++) + char *tmp = args->tmps2; + for (i=0; itmps2[2*i] = '.'; - args->tmps2[2*i+1] = 0; - args->tmpp2[i] = args->tmps2+2*i; + tmp[0] = '.'; + tmp[1] = 0; + args->tmpp2[i] = tmp; + tmp += 2; } } + for (i=0; isample_map[i]==-1 ) continue; + char **src = vals + args->sample_map[i]; + char **dst = args->tmpp2 + i; + + if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } + else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } + else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; } + *dst = *src; + } + return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl); +} +static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( col->icol+args->nsmpl_annot > tab->ncols ) + error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); + hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); + + int icol = col->icol, ismpl; + for (ismpl=0; ismplnsmpl_annot; ismpl++) + { + int32_t *ptr = args->tmpi + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + ptr[ival++] = bcf_int32_missing; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtol(str, &end, 10); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? end+1 : end; + } + while ( ivaltmpi,nvals); +} +static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( col->icol+args->nsmpl_annot > tab->ncols ) + error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); + hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); - for (i=0; ihdr_out); i++) + int icol = col->icol, ismpl; + for (ismpl=0; ismplnsmpl_annot; ismpl++) { - int isrc = args->sample_map[i]; - if ( isrc==-1 ) continue; - args->tmpp2[i] = args->tmpp[isrc]; + float *ptr = args->tmpf + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + bcf_float_set_missing(ptr[ival]); + ival++; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtod(str, &end); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? end+1 : end; + } + while ( ivalhdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out)); + return core_setter_format_real(args,line,col,args->tmpf,nvals); } -static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples) +static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( col->icol+args->nsmpl_annot > tab->ncols ) + error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + + int ismpl; + for (ismpl=0; ismplnsmpl_annot; ismpl++) + args->tmpp[ismpl] = tab->cols[col->icol + ismpl]; + + return core_setter_format_str(args,line,col,args->tmpp); +} +static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + bcf1_t *rec = (bcf1_t*) data; + int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); + if ( nsrc==-3 ) return 0; // the tag is not present + if ( nsrc<=0 ) return 1; // error + return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header)); +} +static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + bcf1_t *rec = (bcf1_t*) data; + int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); + if ( nsrc==-3 ) return 0; // the tag is not present + if ( nsrc<=0 ) return 1; // error + return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header)); +} + +static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + bcf1_t *rec = (bcf1_t*) data; + args->tmpp[0] = args->tmps; + int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps); + args->tmps = args->tmpp[0]; // tmps might be realloced + if ( ret==-3 ) return 0; // the tag is not present + if ( ret<=0 ) return 1; // error + return core_setter_format_str(args,line,col,args->tmpp); +} +static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) { int i; if ( !args->sample_names ) { + args->nsmpl_annot = bcf_hdr_nsamples(dst); + + // tab annotation file, expecting that all samples are present: sample map not needed + if ( !src ) return 0; + int nmatch = 0, order_ok = 1; for (i=0; insample_map = bcf_hdr_nsamples(dst); args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); @@ -1146,46 +1166,70 @@ static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_s int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]); args->sample_map[i] = id; // idst -> isrc, -1 if not present } - return; + return 1; } args->nsample_map = bcf_hdr_nsamples(dst); args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); for (i=0; insample_map; i++) args->sample_map[i] = -1; - int nsamples = 0; - char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples); - for (i=0; isample_names, args->sample_is_file, SMPL_STRICT); + if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names); + char **samples = (char**) malloc(sizeof(char*)*ilist->n); + for (i=0; in; i++) samples[i] = strdup(dst->samples[i]); + args->nsmpl_annot = ilist->n; + smpl_ilist_destroy(ilist); + int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1; + if ( !src ) { - int isrc, idst; - char *ss = samples[i], *se = samples[i]; - while ( *se && !isspace(*se) ) se++; - if ( !*se ) + // tab annotation file + for (i=0; insmpl_annot; i++) + { + int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]); + if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]); + args->sample_map[idst] = i; + if ( idst!=i ) need_sample_map = 1; + } + } + else + { + // vcf annotation file + for (i=0; insmpl_annot; i++) { - // only one sample name + int isrc, idst; + char *ss = samples[i], *se = samples[i]; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) + { + // only one sample name + isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); + if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); + idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); + if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); + args->sample_map[idst] = isrc; + if ( idst!=isrc ) need_sample_map = 1; + continue; + } + *se = 0; isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); + + ss = se+1; + while ( isspace(*ss) ) ss++; + se = ss; + while ( *se && !isspace(*se) ) se++; + idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); + args->sample_map[idst] = isrc; - continue; + if ( idst!=isrc ) need_sample_map = 1; } - *se = 0; - isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); - if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); - - ss = se+1; - while ( isspace(*ss) ) ss++; - se = ss; - while ( *se && !isspace(*se) ) se++; - - idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); - if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); - - args->sample_map[idst] = isrc; } - for (i=0; insmpl_annot; i++) free(samples[i]); free(samples); + return need_sample_map; } static char *columns_complement(char *columns, void **skip_info, void **skip_fmt) { @@ -1247,8 +1291,27 @@ static char *columns_complement(char *columns, void **skip_info, void **skip_fmt free(columns); return str.s; } +static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str) +{ + int j, nout = 0; + ksprintf(str, "##%s=<", hrec->key); + for (j=0; jnkeys; j++) + { + if ( !strcmp("IDX",hrec->keys[j]) ) continue; + if ( nout ) kputc(',',str); + if ( !strcmp("ID", hrec->keys[j]) ) + ksprintf(str,"%s=%s", hrec->keys[j], tag); + else + ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]); + nout++; + } + ksprintf(str,">\n"); +} static void init_columns(args_t *args) { + int need_sample_map = 0; + int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr); + void *skip_fmt = NULL, *skip_info = NULL; if ( args->tgts_is_vcf ) args->columns = columns_complement(args->columns, &skip_info, &skip_fmt); @@ -1256,13 +1319,13 @@ static void init_columns(args_t *args) kstring_t str = {0,0,0}, tmp = {0,0,0}; char *ss = args->columns, *se = ss; args->ncols = 0; - int icol = -1, has_fmt_str = 0, force_samples = -1; + int icol = -1, has_fmt_str = 0; while ( *ss ) { if ( *se && *se!=',' ) { se++; continue; } int replace = REPLACE_ALL; if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; } - else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; } + else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; } else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; } icol++; str.l = 0; @@ -1276,23 +1339,25 @@ static void init_columns(args_t *args) else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol; else if ( !strcasecmp("ID",str.s) ) { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); } else if ( !strcasecmp("FILTER",str.s) ) { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); if ( args->tgts_is_vcf ) { bcf_hdr_t *tgts_hdr = args->files->readers[1].header; @@ -1312,18 +1377,19 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("QUAL",str.s) ) { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); } else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); bcf_hdr_t *tgts_hdr = args->files->readers[1].header; int j; @@ -1343,7 +1409,8 @@ static void init_columns(args_t *args) annot_col_t *col = &args->cols[args->ncols-1]; col->icol = -1; col->replace = replace; - col->hdr_key = strdup(hrec->vals[k]); + col->hdr_key_src = strdup(hrec->vals[k]); + col->hdr_key_dst = strdup(hrec->vals[k]); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) ) { @@ -1358,8 +1425,7 @@ static void init_columns(args_t *args) else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields { bcf_hdr_t *tgts_hdr = args->files->readers[1].header; - if ( force_samples<0 ) force_samples = replace; - if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace; + need_sample_map = 1; int j; for (j=0; jnhrec; j++) { @@ -1377,8 +1443,9 @@ static void init_columns(args_t *args) annot_col_t *col = &args->cols[args->ncols-1]; col->icol = -1; col->replace = replace; - col->hdr_key = strdup(hrec->vals[k]); - if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt; + col->hdr_key_src = strdup(hrec->vals[k]); + col->hdr_key_dst = strdup(hrec->vals[k]); + if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt; else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -1391,18 +1458,27 @@ static void init_columns(args_t *args) } else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) ) { - char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7); - if ( force_samples<0 ) force_samples = replace; - if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace; + char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7); + char *key_src = strstr(key_dst,":="); + if ( key_src ) + { + *key_src = 0; + key_src += 2; + if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7; + else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4; + } + else + key_src = key_dst; + need_sample_map = 1; if ( args->tgts_is_vcf ) { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL); + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL); tmp.l = 0; - bcf_hrec_format(hrec, &tmp); + bcf_hrec_format_rename(hrec, key_dst, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); bcf_hdr_sync(args->hdr_out); } - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key); + int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); @@ -1410,13 +1486,14 @@ static void init_columns(args_t *args) if ( !args->tgts_is_vcf ) { col->icol = icol; - icol += bcf_hdr_nsamples(args->hdr_out) - 1; + icol += args->nsmpl_annot - 1; } else col->icol = -1; col->replace = replace; - col->hdr_key = strdup(key); - if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt; + col->hdr_key_src = strdup(key_src); + col->hdr_key_dst = strdup(key_dst); + if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt; else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -1428,24 +1505,33 @@ static void init_columns(args_t *args) } else { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); - if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); } - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s); + char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s; + char *key_src = strstr(key_dst,":="); + if ( key_src ) + { + *key_src = 0; + key_src += 2; + if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5; + } + else + key_src = key_dst; + int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ) { if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL); + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname); tmp.l = 0; - bcf_hrec_format(hrec, &tmp); + bcf_hrec_format_rename(hrec, key_dst, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); bcf_hdr_sync(args->hdr_out); - hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s); + hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname); + error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } @@ -1453,7 +1539,8 @@ static void init_columns(args_t *args) annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(key_src); + col->hdr_key_dst = strdup(key_dst); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) ) { @@ -1480,8 +1567,13 @@ static void init_columns(args_t *args) args->tmpp = (char**)malloc(sizeof(char*)*n); args->tmpp2 = (char**)malloc(sizeof(char*)*n); } - if ( force_samples>=0 && args->tgts_is_vcf ) - set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1); + if ( !need_sample_map ) + { + free(args->sample_map); + args->sample_map = NULL; + } + else if ( sample_map_ok<0 ) + error("No matching samples in source and destination file?\n"); } static void rename_chrs(args_t *args, char *fname) @@ -1552,7 +1644,6 @@ static void init_data(args_t *args) if ( args->mark_sites ) { if ( !args->targets_fname ) error("The -a option not given\n"); - if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add.. bcf_hdr_printf(args->hdr_out,"##INFO=", args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); } @@ -1564,7 +1655,8 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); + if ( args->n_threads ) + hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); bcf_hdr_write(args->out_fh, args->hdr_out); } } @@ -1577,7 +1669,10 @@ static void destroy_data(args_t *args) if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); if (args->vcmp) vcmp_destroy(args->vcmp); for (i=0; incols; i++) - free(args->cols[i].hdr_key); + { + free(args->cols[i].hdr_key_src); + free(args->cols[i].hdr_key_dst); + } free(args->cols); for (i=0; imalines; i++) { @@ -1718,7 +1813,7 @@ static void annotate(args_t *args, bcf1_t *line) // there is a matching line for (j=0; jncols; j++) if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) - error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); } @@ -1731,12 +1826,20 @@ static void annotate(args_t *args, bcf1_t *line) bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?0:1); } } - else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) ) + else if ( args->files->nreaders == 2 ) { - bcf1_t *aline = bcf_sr_get_line(args->files,1); - for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) - error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + if ( bcf_sr_has_line(args->files,1) ) + { + bcf1_t *aline = bcf_sr_get_line(args->files,1); + for (j=0; jncols; j++) + if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) + error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + + if ( args->mark_sites ) + bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0); + } + else if ( args->mark_sites ) + bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 1 : 0); } if ( args->set_ids ) { @@ -1761,6 +1864,7 @@ static void usage(args_t *args) fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(stderr, " --collapse matching records by , see man page for details [some]\n"); fprintf(stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); @@ -1793,7 +1897,7 @@ int main_vcfannotate(int argc, char *argv[]) args->record_cmd_line = 1; args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; args->set_ids_replace = 1; - int regions_is_file = 0; + int regions_is_file = 0, collapse = 0; static struct option loptions[] = { @@ -1803,6 +1907,7 @@ int main_vcfannotate(int argc, char *argv[]) {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"annotations",required_argument,NULL,'a'}, + {"collapse",required_argument,NULL,2}, {"include",required_argument,NULL,'i'}, {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, @@ -1847,6 +1952,16 @@ int main_vcfannotate(int argc, char *argv[]) case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'h': args->header_fname = optarg; break; case 1 : args->rename_chrs = optarg; break; + case 2 : + if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; + else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; + else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; + else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; + else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; + else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; + else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE; + else error("The --collapse string \"%s\" not recognised.\n", optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': usage(args); break; @@ -1877,9 +1992,10 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - args->files->collapse |= COLLAPSE_SOME; + args->files->collapse = collapse ? collapse : COLLAPSE_SOME; } } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index ea8398c..09f76c2 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -43,6 +44,7 @@ THE SOFTWARE. */ #include "vcmp.h" #include "filter.h" #include "convert.h" +#include "smpl_ilist.h" struct _args_t; @@ -67,12 +69,12 @@ annot_line_t; #define REPLACE_MISSING 0 // replace only missing values #define REPLACE_ALL 1 // replace both missing and existing values -#define REPLACE_EXISTING 2 // replace only if tgt is not missing +#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing #define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise typedef struct _annot_col_t { int icol, replace, number; // number: one of BCF_VL_* types - char *hdr_key; + char *hdr_key_src, *hdr_key_dst; int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); } annot_col_t; @@ -111,6 +113,7 @@ typedef struct _args_t convert_t *set_ids; int set_ids_replace; + int nsmpl_annot; int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc int mtmpi, mtmpf, mtmps; int mtmpi2, mtmpf2, mtmps2; @@ -157,6 +160,7 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag) } line->d.shared_dirty |= BCF1_DIRTY_INF; inf->vptr = NULL; + inf->vptr_off = inf->vptr_len = 0; } } void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag) @@ -189,6 +193,10 @@ void remove_format(args_t *args, bcf1_t *line, rm_tag_t *tag) } } +#include "htslib/khash.h" +KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) +typedef khash_t(vdict) vdict_t; + static void remove_hdr_lines(bcf_hdr_t *hdr, int type) { int i = 0, nrm = 0; @@ -196,11 +204,18 @@ static void remove_hdr_lines(bcf_hdr_t *hdr, int type) { if ( hdr->hrec[i]->type!=type ) { i++; continue; } bcf_hrec_t *hrec = hdr->hrec[i]; - if ( type==BCF_HL_FMT ) + if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) { // everything except FORMAT/GT int id = bcf_hrec_find_key(hrec, "ID"); - if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; } + if ( id>=0 ) + { + if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; } + vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; + khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]); + kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; + kh_val(d, k).info[type] |= 0xf; + } } nrm++; hdr->nhrec--; @@ -455,7 +470,7 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) line->qual = strtod(str, &str); if ( str == tab->cols[col->icol] ) - error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); return 0; } static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -472,31 +487,31 @@ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void * char *str = tab->cols[col->icol]; if ( str[0]=='.' && str[1]==0 ) return 0; - if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1); - if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0); + if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); + if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); return -1; } static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL); - bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag); + int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL); + bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag); return 0; } static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) { if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); if ( !map ) error("REF alleles not compatible at %s:%d\n"); // fill in any missing values in the target VCF (or all, if not present) - int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2); + int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2); int i; @@ -513,7 +528,7 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpi2[i] = args->tmpi[ map[i] ]; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst); + bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); return 0; } static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -539,17 +554,17 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2); + int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi); + bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); return 0; } static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi); + int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); if ( ntmpi < 0 ) return 0; // nothing to add if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) @@ -557,26 +572,26 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2); + int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi); + bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); return 0; } static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) { if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); if ( !map ) error("REF alleles not compatible at %s:%d\n"); // fill in any missing values in the target VCF (or all, if not present) - int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2); + int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2); int i; @@ -593,7 +608,7 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpf2[i] = args->tmpf[ map[i] ]; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst); + bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); return 0; } static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -619,17 +634,17 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2); + int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf); + bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); return 0; } static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf); + int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); if ( ntmpf < 0 ) return 0; // nothing to add if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) @@ -637,11 +652,11 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2); + int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf); + bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); return 0; } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c @@ -654,9 +669,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in lsrc++; } if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) - error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); @@ -664,7 +679,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in // fill in any missing values in the target VCF (or all, if not present) int i, empty = 0, nstr, mstr = args->tmpks.m; - nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr); + nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); args->tmpks.m = mstr; if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) ) { @@ -697,7 +712,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i); assert( ret==0 ); } - bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s); + bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); return 0; } static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -714,17 +729,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2); + int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; } - bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps); + bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); return 0; } static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps); + int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps); if ( ntmps < 0 ) return 0; // nothing to add if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) @@ -732,11 +747,11 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->replace==REPLACE_MISSING ) { - int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2); + int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; } - bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps); + bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); return 0; } static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -754,7 +769,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) // field not present in dst file { - if ( col->replace==REPLACE_EXISTING ) return 0; + if ( col->replace==REPLACE_NON_MISSING ) return 0; hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { @@ -779,7 +794,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( args->sample_map[i]==-1 ) continue; int32_t *src = args->tmpi + nsrc*args->sample_map[i]; int32_t *dst = args->tmpi2 + ndst*i; - if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue; + if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue; if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue; for (j=0; jtmpi3 + nsrc*i; int keep_ori = 0; if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; + else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1; if ( keep_ori ) { @@ -813,7 +828,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo } static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) { - int i, nmax = 0; + int i, nmax = 1; for (i=icol_beg; icols[i], *end = str; @@ -833,298 +848,306 @@ static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) } return nmax; } -static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -{ - annot_line_t *tab = (annot_line_t*) data; - int nsmpl = bcf_hdr_nsamples(args->hdr_out); - assert( col->icol+nsmpl <= tab->ncols ); - int nvals = count_vals(tab,col->icol,col->icol+nsmpl); - assert( nvals>0 ); - hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi); - - int icol = col->icol, ismpl; - for (ismpl=0; ismpltmpi + ismpl*nvals; - int ival = 0; - - char *str = tab->cols[icol]; - while ( *str ) - { - if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value - { - ptr[ival++] = bcf_int32_missing; - str += str[1] ? 2 : 1; - continue; - } - - char *end = str; - ptr[ival] = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? end+1 : end; - } - while ( ivalhdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals); -} -static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -{ - annot_line_t *tab = (annot_line_t*) data; - int nsmpl = bcf_hdr_nsamples(args->hdr_out); - assert( col->icol+nsmpl <= tab->ncols ); - int nvals = count_vals(tab,col->icol,col->icol+nsmpl); - assert( nvals>0 ); - hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf); - - int icol = col->icol, ismpl; - for (ismpl=0; ismpltmpf + ismpl*nvals; - int ival = 0; - - char *str = tab->cols[icol]; - while ( *str ) - { - if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value - { - bcf_float_set_missing(ptr[ival]); - ival++; - str += str[1] ? 2 : 1; - continue; - } - - char *end = str; - ptr[ival] = strtod(str, &end); - if ( end==str ) - error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? end+1 : end; - } - while ( ivalhdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals); -} -static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -{ - annot_line_t *tab = (annot_line_t*) data; - int nsmpl = bcf_hdr_nsamples(args->hdr_out); - assert( col->icol+nsmpl <= tab->ncols ); - - int i, max_len = 0; - for (i=col->icol; iicol+nsmpl; i++) - { - int len = strlen(tab->cols[i]); - if ( max_len < len ) max_len = len; - } - hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps); - - int icol = col->icol, ismpl; - for (ismpl=0; ismpltmps + ismpl*max_len; - char *str = tab->cols[icol]; - i = 0; - while ( str[i] ) - { - ptr[i] = str[i]; - i++; - } - while ( ihdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len); -} -static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals) { - bcf1_t *rec = (bcf1_t*) data; - int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi); - if ( nsrc==-3 ) return 0; // the tag is not present - if ( nsrc<=0 ) return 1; // error - if ( !args->sample_map ) - return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot); - int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2); + int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2); if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); - nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) { - if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present - hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); + if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { - int32_t *dst = args->tmpi2 + nsrc*i; + int32_t *dst = args->tmpi2 + nvals*i; if ( args->sample_map[i]==-1 ) { dst[0] = bcf_int32_missing; - for (j=1; jtmpi + nsrc*args->sample_map[i]; - for (j=0; jsample_map[i]; + for (j=0; jhdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nsrc ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { if ( args->sample_map[i]==-1 ) continue; - int32_t *src = args->tmpi + nsrc*args->sample_map[i]; + int32_t *src = vals + nvals*args->sample_map[i]; int32_t *dst = args->tmpi2 + ndst*i; - if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue; - if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue; - for (j=0; jreplace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } + else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } + else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } + for (j=0; jhdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out)); } - else // ndst < nsrc + else // ndst < nvals { - hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3); + hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3); for (i=0; ihdr_out); i++) { - int32_t *ori = args->tmpi2 + ndst*i; - int32_t *dst = args->tmpi3 + nsrc*i; - int keep_ori = 0; - if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1; - else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1; - if ( keep_ori ) + int32_t *ann = vals + nvals*args->sample_map[i]; + int32_t *ori = args->tmpi2 + ndst*i; // ori vcf line + int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer + int use_new_ann = 1; + if ( args->sample_map[i]==-1 ) use_new_ann = 0; + else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; } + if ( !use_new_ann ) { for (j=0; jtmpi + nsrc*args->sample_map[i]; - for (j=0; jhdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out)); } } -static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals) { - bcf1_t *rec = (bcf1_t*) data; - int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf); - if ( nsrc==-3 ) return 0; // the tag is not present - if ( nsrc<=0 ) return 1; // error - if ( !args->sample_map ) - return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot); - int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2); + int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2); if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); - nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) { - if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present - hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2); + if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2); for (i=0; ihdr_out); i++) { - float *dst = args->tmpf2 + nsrc*i; + float *dst = args->tmpf2 + nvals*i; if ( args->sample_map[i]==-1 ) { bcf_float_set_missing(dst[0]); - for (j=1; jtmpf + nsrc*args->sample_map[i]; - for (j=0; jsample_map[i]; + for (j=0; jhdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nsrc ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { if ( args->sample_map[i]==-1 ) continue; - float *src = args->tmpf + nsrc*args->sample_map[i]; + float *src = vals + nvals*args->sample_map[i]; float *dst = args->tmpf2 + ndst*i; - if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue; - if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue; - for (j=0; jreplace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } + for (j=0; jhdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out)); } - else // ndst < nsrc + else // ndst < nvals { - hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3); + hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3); for (i=0; ihdr_out); i++) { - float *ori = args->tmpf2 + ndst*i; - float *dst = args->tmpf3 + nsrc*i; - int keep_ori = 0; - if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1; - else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1; - if ( keep_ori ) + float *ann = vals + nvals*args->sample_map[i]; + float *ori = args->tmpf2 + ndst*i; // ori vcf line + float *dst = args->tmpf3 + nvals*i; // expanded buffer + int use_new_ann = 1; + if ( args->sample_map[i]==-1 ) use_new_ann = 0; + else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; } + if ( !use_new_ann ) { for (j=0; jtmpf + nsrc*args->sample_map[i]; - for (j=0; jhdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out)); } } -static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals) { - bcf1_t *rec = (bcf1_t*) data; - args->tmpp[0] = args->tmps; - int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps); - args->tmps = args->tmpp[0]; // tmps might be realloced - if ( ret==-3 ) return 0; // the tag is not present - if ( ret<=0 ) return 1; // error - if ( !args->sample_map ) - return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out)); + return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot); int i; args->tmpp2[0] = args->tmps2; - ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2); + int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2); args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced + int nsmpl = bcf_hdr_nsamples(args->hdr_out); if ( ret<=0 ) // not present in dst { hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2); - for (i=0; ihdr_out); i++) + char *tmp = args->tmps2; + for (i=0; itmps2[2*i] = '.'; - args->tmps2[2*i+1] = 0; - args->tmpp2[i] = args->tmps2+2*i; + tmp[0] = '.'; + tmp[1] = 0; + args->tmpp2[i] = tmp; + tmp += 2; } } + for (i=0; isample_map[i]==-1 ) continue; + char **src = vals + args->sample_map[i]; + char **dst = args->tmpp2 + i; + + if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } + else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } + else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; } + *dst = *src; + } + return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl); +} +static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( col->icol+args->nsmpl_annot > tab->ncols ) + error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); + hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); + + int icol = col->icol, ismpl; + for (ismpl=0; ismplnsmpl_annot; ismpl++) + { + int32_t *ptr = args->tmpi + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + ptr[ival++] = bcf_int32_missing; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtol(str, &end, 10); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? end+1 : end; + } + while ( ivaltmpi,nvals); +} +static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( col->icol+args->nsmpl_annot > tab->ncols ) + error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); + hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); - for (i=0; ihdr_out); i++) + int icol = col->icol, ismpl; + for (ismpl=0; ismplnsmpl_annot; ismpl++) { - int isrc = args->sample_map[i]; - if ( isrc==-1 ) continue; - args->tmpp2[i] = args->tmpp[isrc]; + float *ptr = args->tmpf + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + bcf_float_set_missing(ptr[ival]); + ival++; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtod(str, &end); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? end+1 : end; + } + while ( ivalhdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out)); + return core_setter_format_real(args,line,col,args->tmpf,nvals); } -static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples) +static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( col->icol+args->nsmpl_annot > tab->ncols ) + error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + + int ismpl; + for (ismpl=0; ismplnsmpl_annot; ismpl++) + args->tmpp[ismpl] = tab->cols[col->icol + ismpl]; + + return core_setter_format_str(args,line,col,args->tmpp); +} +static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + bcf1_t *rec = (bcf1_t*) data; + int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); + if ( nsrc==-3 ) return 0; // the tag is not present + if ( nsrc<=0 ) return 1; // error + return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header)); +} +static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + bcf1_t *rec = (bcf1_t*) data; + int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); + if ( nsrc==-3 ) return 0; // the tag is not present + if ( nsrc<=0 ) return 1; // error + return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header)); +} + +static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + bcf1_t *rec = (bcf1_t*) data; + args->tmpp[0] = args->tmps; + int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps); + args->tmps = args->tmpp[0]; // tmps might be realloced + if ( ret==-3 ) return 0; // the tag is not present + if ( ret<=0 ) return 1; // error + return core_setter_format_str(args,line,col,args->tmpp); +} +static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) { int i; if ( !args->sample_names ) { + args->nsmpl_annot = bcf_hdr_nsamples(dst); + + // tab annotation file, expecting that all samples are present: sample map not needed + if ( !src ) return 0; + int nmatch = 0, order_ok = 1; for (i=0; insample_map = bcf_hdr_nsamples(dst); args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); @@ -1148,46 +1168,70 @@ static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_s int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]); args->sample_map[i] = id; // idst -> isrc, -1 if not present } - return; + return 1; } args->nsample_map = bcf_hdr_nsamples(dst); args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); for (i=0; insample_map; i++) args->sample_map[i] = -1; - int nsamples = 0; - char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples); - for (i=0; isample_names, args->sample_is_file, SMPL_STRICT); + if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names); + char **samples = (char**) malloc(sizeof(char*)*ilist->n); + for (i=0; in; i++) samples[i] = strdup(dst->samples[i]); + args->nsmpl_annot = ilist->n; + smpl_ilist_destroy(ilist); + int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1; + if ( !src ) { - int isrc, idst; - char *ss = samples[i], *se = samples[i]; - while ( *se && !isspace(*se) ) se++; - if ( !*se ) + // tab annotation file + for (i=0; insmpl_annot; i++) + { + int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]); + if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]); + args->sample_map[idst] = i; + if ( idst!=i ) need_sample_map = 1; + } + } + else + { + // vcf annotation file + for (i=0; insmpl_annot; i++) { - // only one sample name + int isrc, idst; + char *ss = samples[i], *se = samples[i]; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) + { + // only one sample name + isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); + if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); + idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); + if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); + args->sample_map[idst] = isrc; + if ( idst!=isrc ) need_sample_map = 1; + continue; + } + *se = 0; isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); + + ss = se+1; + while ( isspace(*ss) ) ss++; + se = ss; + while ( *se && !isspace(*se) ) se++; + idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); + args->sample_map[idst] = isrc; - continue; + if ( idst!=isrc ) need_sample_map = 1; } - *se = 0; - isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); - if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); - - ss = se+1; - while ( isspace(*ss) ) ss++; - se = ss; - while ( *se && !isspace(*se) ) se++; - - idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); - if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); - - args->sample_map[idst] = isrc; } - for (i=0; insmpl_annot; i++) free(samples[i]); free(samples); + return need_sample_map; } static char *columns_complement(char *columns, void **skip_info, void **skip_fmt) { @@ -1249,8 +1293,27 @@ static char *columns_complement(char *columns, void **skip_info, void **skip_fmt free(columns); return str.s; } +static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str) +{ + int j, nout = 0; + ksprintf(str, "##%s=<", hrec->key); + for (j=0; jnkeys; j++) + { + if ( !strcmp("IDX",hrec->keys[j]) ) continue; + if ( nout ) kputc(',',str); + if ( !strcmp("ID", hrec->keys[j]) ) + ksprintf(str,"%s=%s", hrec->keys[j], tag); + else + ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]); + nout++; + } + ksprintf(str,">\n"); +} static void init_columns(args_t *args) { + int need_sample_map = 0; + int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr); + void *skip_fmt = NULL, *skip_info = NULL; if ( args->tgts_is_vcf ) args->columns = columns_complement(args->columns, &skip_info, &skip_fmt); @@ -1258,13 +1321,13 @@ static void init_columns(args_t *args) kstring_t str = {0,0,0}, tmp = {0,0,0}; char *ss = args->columns, *se = ss; args->ncols = 0; - int icol = -1, has_fmt_str = 0, force_samples = -1; + int icol = -1, has_fmt_str = 0; while ( *ss ) { if ( *se && *se!=',' ) { se++; continue; } int replace = REPLACE_ALL; if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; } - else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; } + else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; } else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; } icol++; str.l = 0; @@ -1278,23 +1341,25 @@ static void init_columns(args_t *args) else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol; else if ( !strcasecmp("ID",str.s) ) { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); } else if ( !strcasecmp("FILTER",str.s) ) { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); if ( args->tgts_is_vcf ) { bcf_hdr_t *tgts_hdr = args->files->readers[1].header; @@ -1314,18 +1379,19 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("QUAL",str.s) ) { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); } else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); bcf_hdr_t *tgts_hdr = args->files->readers[1].header; int j; @@ -1345,7 +1411,8 @@ static void init_columns(args_t *args) annot_col_t *col = &args->cols[args->ncols-1]; col->icol = -1; col->replace = replace; - col->hdr_key = strdup(hrec->vals[k]); + col->hdr_key_src = strdup(hrec->vals[k]); + col->hdr_key_dst = strdup(hrec->vals[k]); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) ) { @@ -1360,8 +1427,7 @@ static void init_columns(args_t *args) else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields { bcf_hdr_t *tgts_hdr = args->files->readers[1].header; - if ( force_samples<0 ) force_samples = replace; - if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace; + need_sample_map = 1; int j; for (j=0; jnhrec; j++) { @@ -1379,8 +1445,9 @@ static void init_columns(args_t *args) annot_col_t *col = &args->cols[args->ncols-1]; col->icol = -1; col->replace = replace; - col->hdr_key = strdup(hrec->vals[k]); - if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt; + col->hdr_key_src = strdup(hrec->vals[k]); + col->hdr_key_dst = strdup(hrec->vals[k]); + if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt; else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -1393,18 +1460,27 @@ static void init_columns(args_t *args) } else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) ) { - char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7); - if ( force_samples<0 ) force_samples = replace; - if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace; + char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7); + char *key_src = strstr(key_dst,":="); + if ( key_src ) + { + *key_src = 0; + key_src += 2; + if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7; + else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4; + } + else + key_src = key_dst; + need_sample_map = 1; if ( args->tgts_is_vcf ) { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL); + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL); tmp.l = 0; - bcf_hrec_format(hrec, &tmp); + bcf_hrec_format_rename(hrec, key_dst, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); bcf_hdr_sync(args->hdr_out); } - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key); + int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); @@ -1412,13 +1488,14 @@ static void init_columns(args_t *args) if ( !args->tgts_is_vcf ) { col->icol = icol; - icol += bcf_hdr_nsamples(args->hdr_out) - 1; + icol += args->nsmpl_annot - 1; } else col->icol = -1; col->replace = replace; - col->hdr_key = strdup(key); - if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt; + col->hdr_key_src = strdup(key_src); + col->hdr_key_dst = strdup(key_dst); + if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt; else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -1430,24 +1507,33 @@ static void init_columns(args_t *args) } else { - if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); - if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); } - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s); + char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s; + char *key_src = strstr(key_dst,":="); + if ( key_src ) + { + *key_src = 0; + key_src += 2; + if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5; + } + else + key_src = key_dst; + int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ) { if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL); + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname); tmp.l = 0; - bcf_hrec_format(hrec, &tmp); + bcf_hrec_format_rename(hrec, key_dst, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); bcf_hdr_sync(args->hdr_out); - hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s); + hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname); + error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } @@ -1455,7 +1541,8 @@ static void init_columns(args_t *args) annot_col_t *col = &args->cols[args->ncols-1]; col->icol = icol; col->replace = replace; - col->hdr_key = strdup(str.s); + col->hdr_key_src = strdup(key_src); + col->hdr_key_dst = strdup(key_dst); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) ) { @@ -1482,8 +1569,13 @@ static void init_columns(args_t *args) args->tmpp = (char**)malloc(sizeof(char*)*n); args->tmpp2 = (char**)malloc(sizeof(char*)*n); } - if ( force_samples>=0 && args->tgts_is_vcf ) - set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1); + if ( !need_sample_map ) + { + free(args->sample_map); + args->sample_map = NULL; + } + else if ( sample_map_ok<0 ) + error("No matching samples in source and destination file?\n"); } static void rename_chrs(args_t *args, char *fname) @@ -1554,7 +1646,6 @@ static void init_data(args_t *args) if ( args->mark_sites ) { if ( !args->targets_fname ) error("The -a option not given\n"); - if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add.. bcf_hdr_printf(args->hdr_out,"##INFO=", args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); } @@ -1566,7 +1657,8 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); + if ( args->n_threads ) + hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); bcf_hdr_write(args->out_fh, args->hdr_out); } } @@ -1579,7 +1671,10 @@ static void destroy_data(args_t *args) if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); if (args->vcmp) vcmp_destroy(args->vcmp); for (i=0; incols; i++) - free(args->cols[i].hdr_key); + { + free(args->cols[i].hdr_key_src); + free(args->cols[i].hdr_key_dst); + } free(args->cols); for (i=0; imalines; i++) { @@ -1720,7 +1815,7 @@ static void annotate(args_t *args, bcf1_t *line) // there is a matching line for (j=0; jncols; j++) if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) - error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); } @@ -1733,12 +1828,20 @@ static void annotate(args_t *args, bcf1_t *line) bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?0:1); } } - else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) ) + else if ( args->files->nreaders == 2 ) { - bcf1_t *aline = bcf_sr_get_line(args->files,1); - for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) - error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1); + if ( bcf_sr_has_line(args->files,1) ) + { + bcf1_t *aline = bcf_sr_get_line(args->files,1); + for (j=0; jncols; j++) + if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) + error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); + + if ( args->mark_sites ) + bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0); + } + else if ( args->mark_sites ) + bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 1 : 0); } if ( args->set_ids ) { @@ -1763,6 +1866,7 @@ static void usage(args_t *args) fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "Options:\n"); fprintf(pysam_stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(pysam_stderr, " --collapse matching records by , see man page for details [some]\n"); fprintf(pysam_stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); fprintf(pysam_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); @@ -1795,7 +1899,7 @@ int main_vcfannotate(int argc, char *argv[]) args->record_cmd_line = 1; args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; args->set_ids_replace = 1; - int regions_is_file = 0; + int regions_is_file = 0, collapse = 0; static struct option loptions[] = { @@ -1805,6 +1909,7 @@ int main_vcfannotate(int argc, char *argv[]) {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"annotations",required_argument,NULL,'a'}, + {"collapse",required_argument,NULL,2}, {"include",required_argument,NULL,'i'}, {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, @@ -1849,6 +1954,16 @@ int main_vcfannotate(int argc, char *argv[]) case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'h': args->header_fname = optarg; break; case 1 : args->rename_chrs = optarg; break; + case 2 : + if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; + else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; + else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; + else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; + else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; + else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; + else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE; + else error("The --collapse string \"%s\" not recognised.\n", optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': usage(args); break; @@ -1879,9 +1994,10 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - args->files->collapse |= COLLAPSE_SOME; + args->files->collapse = collapse ? collapse : COLLAPSE_SOME; } } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index e5bbf11..00771f7 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -24,6 +24,7 @@ THE SOFTWARE. */ #include #include +#include #include #include #include @@ -146,7 +147,7 @@ static ploidy_predef_t ploidy_predefs[] = "* * * F 2\n" }, { .alias = "GRCh38", - .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)", + .about = "Human Genome reference assembly GRCh38 / hg38", .ploidy = "X 1 9999 M 1\n" "X 2781480 155701381 M 1\n" @@ -275,7 +276,7 @@ static void set_samples(args_t *args, const char *fn, int is_file) args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); - int dflt_sex_id = ploidy_add_sex(args->ploidy, "F"); + int dflt_sex_id = ploidy_nsex(args->ploidy) - 1; for (i=0; iaux.hdr); i++) args->sample2sex[i] = dflt_sex_id; int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); @@ -294,6 +295,7 @@ static void set_samples(args_t *args, const char *fn, int is_file) int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss); if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } ss = se+1; while ( *ss && isspace(*ss) ) ss++; @@ -411,18 +413,24 @@ static void init_data(args_t *args) { args->nsamples = bcf_hdr_nsamples(args->aux.hdr); args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples); - for (i=0; insamples; i++) args->sample2sex[i] = 0; + for (i=0; insamples; i++) args->sample2sex[i] = args->nsex - 1; } } if ( args->nsamples ) { args->aux.ploidy = (uint8_t*) malloc(args->nsamples); - for (i=0; insamples; i++) args->aux.ploidy[i] = 2; - for (i=0; insex; i++) args->sex2ploidy_prev[i] = 2; + for (i=0; insamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy); + for (i=0; insex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy); + for (i=0; insamples; i++) + if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1; } - if ( args->gvcf ) + if ( args->gvcf ) + { + int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP"); + if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n"); gvcf_update_header(args->gvcf, args->aux.hdr); + } if ( args->samples_map ) { @@ -554,7 +562,6 @@ static void set_ploidy(args_t *args, bcf1_t *rec) else args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]]; } - int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp; } @@ -569,7 +576,10 @@ ploidy_t *init_ploidy(char *alias) if ( !pld->alias ) { - fprintf(stderr,"Predefined ploidies:\n"); + fprintf(stderr,"\nPRE-DEFINED PLOIDY FILES\n\n"); + fprintf(stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(stderr," * Coordinates are 1-based inclusive.\n"); + fprintf(stderr," * A '*' means any value not otherwise defined.\n\n"); pld = ploidy_predefs; while ( pld->alias ) { @@ -618,6 +628,7 @@ static void usage(args_t *args) fprintf(stderr, "Input/output options:\n"); fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); fprintf(stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); + fprintf(stderr, " -F, --prior-freqs use prior allele frequencies\n"); fprintf(stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); @@ -630,7 +641,7 @@ static void usage(args_t *args) fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); fprintf(stderr, " -n, --novel-rate ,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); fprintf(stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity) [1.1e-3]\n"); + fprintf(stderr, " -P, --prior mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); // todo (and more) // fprintf(stderr, "\nContrast calling and association test options:\n"); @@ -667,6 +678,7 @@ int main_vcfcall(int argc, char *argv[]) { {"help",no_argument,NULL,'h'}, {"format-fields",required_argument,NULL,'f'}, + {"prior-freqs",required_argument,NULL,'F'}, {"gvcf",required_argument,NULL,'g'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, @@ -698,7 +710,7 @@ int main_vcfcall(int argc, char *argv[]) }; char *tmp = NULL; - while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0) { switch (c) { @@ -713,6 +725,13 @@ int main_vcfcall(int argc, char *argv[]) case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method case 'i': args.flag |= CF_INS_MISSED; break; case 'v': args.aux.flag |= CALL_VARONLY; break; + case 'F': + args.aux.prior_AN = optarg; + args.aux.prior_AC = strchr(optarg,','); + if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg); + *args.aux.prior_AC = 0; + args.aux.prior_AC++; + break; case 'g': args.gvcf = gvcf_init(optarg); if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg); @@ -770,8 +789,8 @@ int main_vcfcall(int argc, char *argv[]) if ( !ploidy_fname && !ploidy ) { - fprintf(stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n"); - args.ploidy = ploidy_init_string("",2); + if ( !args.samples_is_file ) fprintf(stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n"); + args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2); } if ( !args.ploidy ) error("Could not initialize ploidy\n"); @@ -833,6 +852,7 @@ int main_vcfcall(int argc, char *argv[]) else ret = ccall(&args.aux, bcf_rec); if ( ret==-1 ) error("Something is wrong\n"); + else if ( ret==-2 ) continue; // skip the site // Normal output if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 8e59fd9..8e6721b 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -26,6 +26,7 @@ THE SOFTWARE. */ #include #include +#include #include #include #include @@ -148,7 +149,7 @@ static ploidy_predef_t ploidy_predefs[] = "* * * F 2\n" }, { .alias = "GRCh38", - .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)", + .about = "Human Genome reference assembly GRCh38 / hg38", .ploidy = "X 1 9999 M 1\n" "X 2781480 155701381 M 1\n" @@ -277,7 +278,7 @@ static void set_samples(args_t *args, const char *fn, int is_file) args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); - int dflt_sex_id = ploidy_add_sex(args->ploidy, "F"); + int dflt_sex_id = ploidy_nsex(args->ploidy) - 1; for (i=0; iaux.hdr); i++) args->sample2sex[i] = dflt_sex_id; int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); @@ -296,6 +297,7 @@ static void set_samples(args_t *args, const char *fn, int is_file) int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss); if ( ismpl < 0 ) { fprintf(pysam_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + if ( old2new[ismpl] != -1 ) { fprintf(pysam_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } ss = se+1; while ( *ss && isspace(*ss) ) ss++; @@ -413,18 +415,24 @@ static void init_data(args_t *args) { args->nsamples = bcf_hdr_nsamples(args->aux.hdr); args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples); - for (i=0; insamples; i++) args->sample2sex[i] = 0; + for (i=0; insamples; i++) args->sample2sex[i] = args->nsex - 1; } } if ( args->nsamples ) { args->aux.ploidy = (uint8_t*) malloc(args->nsamples); - for (i=0; insamples; i++) args->aux.ploidy[i] = 2; - for (i=0; insex; i++) args->sex2ploidy_prev[i] = 2; + for (i=0; insamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy); + for (i=0; insex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy); + for (i=0; insamples; i++) + if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1; } - if ( args->gvcf ) + if ( args->gvcf ) + { + int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP"); + if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n"); gvcf_update_header(args->gvcf, args->aux.hdr); + } if ( args->samples_map ) { @@ -556,7 +564,6 @@ static void set_ploidy(args_t *args, bcf1_t *rec) else args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]]; } - int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp; } @@ -571,7 +578,10 @@ ploidy_t *init_ploidy(char *alias) if ( !pld->alias ) { - fprintf(pysam_stderr,"Predefined ploidies:\n"); + fprintf(pysam_stderr,"\nPRE-DEFINED PLOIDY FILES\n\n"); + fprintf(pysam_stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(pysam_stderr," * Coordinates are 1-based inclusive.\n"); + fprintf(pysam_stderr," * A '*' means any value not otherwise defined.\n\n"); pld = ploidy_predefs; while ( pld->alias ) { @@ -620,6 +630,7 @@ static void usage(args_t *args) fprintf(pysam_stderr, "Input/output options:\n"); fprintf(pysam_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); fprintf(pysam_stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); + fprintf(pysam_stderr, " -F, --prior-freqs use prior allele frequencies\n"); fprintf(pysam_stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); fprintf(pysam_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); fprintf(pysam_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); @@ -632,7 +643,7 @@ static void usage(args_t *args) fprintf(pysam_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); fprintf(pysam_stderr, " -n, --novel-rate ,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); fprintf(pysam_stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity) [1.1e-3]\n"); + fprintf(pysam_stderr, " -P, --prior mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); // todo (and more) // fprintf(pysam_stderr, "\nContrast calling and association test options:\n"); @@ -669,6 +680,7 @@ int main_vcfcall(int argc, char *argv[]) { {"help",no_argument,NULL,'h'}, {"format-fields",required_argument,NULL,'f'}, + {"prior-freqs",required_argument,NULL,'F'}, {"gvcf",required_argument,NULL,'g'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, @@ -700,7 +712,7 @@ int main_vcfcall(int argc, char *argv[]) }; char *tmp = NULL; - while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0) { switch (c) { @@ -715,6 +727,13 @@ int main_vcfcall(int argc, char *argv[]) case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method case 'i': args.flag |= CF_INS_MISSED; break; case 'v': args.aux.flag |= CALL_VARONLY; break; + case 'F': + args.aux.prior_AN = optarg; + args.aux.prior_AC = strchr(optarg,','); + if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg); + *args.aux.prior_AC = 0; + args.aux.prior_AC++; + break; case 'g': args.gvcf = gvcf_init(optarg); if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg); @@ -772,8 +791,8 @@ int main_vcfcall(int argc, char *argv[]) if ( !ploidy_fname && !ploidy ) { - fprintf(pysam_stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n"); - args.ploidy = ploidy_init_string("",2); + if ( !args.samples_is_file ) fprintf(pysam_stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n"); + args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2); } if ( !args.ploidy ) error("Could not initialize ploidy\n"); @@ -835,6 +854,7 @@ int main_vcfcall(int argc, char *argv[]) else ret = ccall(&args.aux, bcf_rec); if ( ret==-1 ) error("Something is wrong\n"); + else if ( ret==-2 ) continue; // skip the site // Normal output if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c index e4b9372..ffe71c4 100644 --- a/bcftools/vcfcnv.c +++ b/bcftools/vcfcnv.c @@ -266,17 +266,15 @@ static void init_data(args_t *args) hmm_init_states(args->hmm, args->iprobs); args->summary_fh = stdout; - if ( args->output_dir ) + init_sample_files(&args->query_sample, args->output_dir); + if ( args->control_sample.name ) { - init_sample_files(&args->query_sample, args->output_dir); - if ( args->control_sample.name ) - { - init_sample_files(&args->control_sample, args->output_dir); - args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir); - } - else - args->summary_fh = NULL; // one sample only, no two-file summary + init_sample_files(&args->control_sample, args->output_dir); + args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir); } + else + args->summary_fh = NULL; // one sample only, no two-file summary + int i; FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh; @@ -295,6 +293,19 @@ static void init_data(args_t *args) "# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n", args->query_sample.name ); + if ( args->optimize_frac ) + { + fprintf(args->query_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n"); + if ( args->control_sample.name ) + { + fprintf(args->control_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n"); + fprintf(args->summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t" + "[5]Cell fraction:%s\t[6]Cell fraction:%s\t[7]BAF deviation:%s\t[8]BAF deviation:%s\n", + args->query_sample.name,args->control_sample.name, + args->query_sample.name,args->control_sample.name + ); + } + } } char *msprintf(const char *fmt, ...); @@ -556,6 +567,7 @@ static void destroy_data(args_t *args) free(args->sites); free(args->eprob); free(args->tprob); + free(args->iprobs); free(args->summary_fname); free(args->nonref_afs); free(args->query_sample.baf); @@ -960,6 +972,20 @@ static void cnv_flush_viterbi(args_t *args) if ( args->control_sample.name ) fprintf(stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); fprintf(stderr,"\n"); + + fprintf(args->query_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n", + bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1, + args->query_sample.cell_frac,sqrt(args->query_sample.baf_dev2)); + if ( args->control_sample.name ) + { + fprintf(args->control_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n", + bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1, + args->control_sample.cell_frac,sqrt(args->control_sample.baf_dev2)); + fprintf(args->summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%.2f\t%f\t%f\n", + bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1, + args->query_sample.cell_frac, args->control_sample.cell_frac, + sqrt(args->query_sample.baf_dev2), sqrt(args->control_sample.baf_dev2)); + } } set_emission_probs(args); @@ -1351,7 +1377,7 @@ int main_vcfcnv(int argc, char *argv[]) else fname = argv[optind]; if ( !fname ) usage(args); - if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n"); + if ( !args->output_dir ) error("Expected -o option\n"); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index 10a00b9..1075ef1 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -268,17 +268,15 @@ static void init_data(args_t *args) hmm_init_states(args->hmm, args->iprobs); args->summary_fh = pysam_stdout; - if ( args->output_dir ) + init_sample_files(&args->query_sample, args->output_dir); + if ( args->control_sample.name ) { - init_sample_files(&args->query_sample, args->output_dir); - if ( args->control_sample.name ) - { - init_sample_files(&args->control_sample, args->output_dir); - args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir); - } - else - args->summary_fh = NULL; // one sample only, no two-file summary + init_sample_files(&args->control_sample, args->output_dir); + args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir); } + else + args->summary_fh = NULL; // one sample only, no two-file summary + int i; FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh; @@ -297,6 +295,19 @@ static void init_data(args_t *args) "# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n", args->query_sample.name ); + if ( args->optimize_frac ) + { + fprintf(args->query_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n"); + if ( args->control_sample.name ) + { + fprintf(args->control_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n"); + fprintf(args->summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t" + "[5]Cell fraction:%s\t[6]Cell fraction:%s\t[7]BAF deviation:%s\t[8]BAF deviation:%s\n", + args->query_sample.name,args->control_sample.name, + args->query_sample.name,args->control_sample.name + ); + } + } } char *msprintf(const char *fmt, ...); @@ -558,6 +569,7 @@ static void destroy_data(args_t *args) free(args->sites); free(args->eprob); free(args->tprob); + free(args->iprobs); free(args->summary_fname); free(args->nonref_afs); free(args->query_sample.baf); @@ -962,6 +974,20 @@ static void cnv_flush_viterbi(args_t *args) if ( args->control_sample.name ) fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); fprintf(pysam_stderr,"\n"); + + fprintf(args->query_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n", + bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1, + args->query_sample.cell_frac,sqrt(args->query_sample.baf_dev2)); + if ( args->control_sample.name ) + { + fprintf(args->control_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n", + bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1, + args->control_sample.cell_frac,sqrt(args->control_sample.baf_dev2)); + fprintf(args->summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%.2f\t%f\t%f\n", + bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1, + args->query_sample.cell_frac, args->control_sample.cell_frac, + sqrt(args->query_sample.baf_dev2), sqrt(args->control_sample.baf_dev2)); + } } set_emission_probs(args); @@ -1353,7 +1379,7 @@ int main_vcfcnv(int argc, char *argv[]) else fname = argv[optind]; if ( !fname ) usage(args); - if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n"); + if ( !args->output_dir ) error("Expected -o option\n"); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index bd6a00a..3345c20 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -555,100 +555,138 @@ static void concat(args_t *args) } } +int print_vcf_gz_header(BGZF *fp, BGZF *bgzf_out, int print_header, kstring_t *tmp) +{ + char *buffer = (char*) fp->uncompressed_block; + + // Read the header and find the position of the data block + if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]); + + int nskip = 1; // end of the header in the current uncompressed block + while (1) + { + if ( buffer[nskip]=='\n' ) + { + nskip++; + if ( nskip>=fp->block_length ) + { + kputsn(buffer,nskip,tmp); + if ( bgzf_read_block(fp) != 0 ) return -1; + if ( !fp->block_length ) break; + nskip = 0; + } + // The header has finished + if ( buffer[nskip]!='#' ) + { + kputsn(buffer,nskip,tmp); + break; + } + } + nskip++; + if ( nskip>=fp->block_length ) + { + kputsn(buffer,fp->block_length,tmp); + if ( bgzf_read_block(fp) != 0 ) return -1; + if ( !fp->block_length ) break; + nskip = 0; + } + } + if ( print_header ) + { + if ( bgzf_write(bgzf_out,tmp->s,tmp->l) != tmp->l ) error("Failed to write %d bytes\n", tmp->l); + tmp->l = 0; + } + return nskip; +} + +static inline int unpackInt16(const uint8_t *buffer) +{ + return buffer[0] | buffer[1] << 8; +} +static int check_header(const uint8_t *header) +{ + if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2; + return ((header[3] & 4) != 0 + && unpackInt16((uint8_t*)&header[10]) == 6 + && header[12] == 'B' && header[13] == 'C' + && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1; +} static void naive_concat(args_t *args) { // only compressed BCF atm BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; - const size_t page_size = 32768; - char *buf = (char*) malloc(page_size); + const size_t page_size = BGZF_MAX_BLOCK_SIZE; + uint8_t *buf = (uint8_t*) malloc(page_size); kstring_t tmp = {0,0,0}; - int i; + int i, file_types = 0; for (i=0; infnames; i++) { htsFile *hts_fp = hts_open(args->fnames[i],"r"); if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); htsFormat type = *hts_get_format(hts_fp); - if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); - if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); + if ( type.compression!=bgzf ) + error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); + file_types |= type.format==vcf ? 1 : 2; + if ( file_types==3 ) + error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); BGZF *fp = hts_get_bgzfp(hts_fp); if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); - uint8_t magic[5]; - if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); - if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); + int nskip; + if ( type.format==bcf ) + { + uint8_t magic[5]; + if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); - if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); - hts_expand(char,tmp.l,tmp.m,tmp.s); - if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + hts_expand(char,tmp.l,tmp.m,tmp.s); + if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); - // write only the first header - if ( i==0 ) + // write only the first header + if ( i==0 ) + { + if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); + if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); + if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname); + } + nskip = fp->block_offset; + } + else { - if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); - if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); - if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname); + nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp); + if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]); } // Output all non-header data that were read together with the header block - int nskip = fp->block_offset; if ( fp->block_length - nskip > 0 ) { - if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); + if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); } if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks - ssize_t nread, ncached = 0, nwr; - const int neof = 28; - char cached[neof]; + // The final bgzf eof block will be added by bgzf_close. + ssize_t nread, nblock, nwr; + const int nheader = 18, neof = 28; + const uint8_t *eof = (uint8_t*) "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; while (1) { - nread = bgzf_raw_read(fp, buf, page_size); - - // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends - if ( nread<=0 ) break; - if ( nread<=neof ) // last block - { - if ( ncached ) - { - // flush the part of the cache that won't be needed - nwr = bgzf_raw_write(bgzf_out, cached, nread); - if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); - - // make space in the cache so that we can append to the end - if ( nread!=neof ) memmove(cached,cached+nread,neof-nread); - } - - // fill the cache and check for eof outside this loop - memcpy(cached+neof-nread,buf,nread); - break; - } - - // not the last block, flush the cache if full - if ( ncached ) - { - nwr = bgzf_raw_write(bgzf_out, cached, ncached); - if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached); - ncached = 0; - } - - // fill the cache - nread -= neof; - memcpy(cached,buf+nread,neof); - ncached = neof; - + nread = bgzf_raw_read(fp, buf, nheader); + if ( !nread ) break; + if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]); + nblock = unpackInt16(buf+16) + 1; + assert( nblock <= page_size && nblock >= nheader ); + nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader); + if ( nread!=nblock ) error("Could not read %d bytes: %s\n",nblock,args->fnames[i]); + if ( nread==neof && !memcmp(buf,eof,neof) ) continue; nwr = bgzf_raw_write(bgzf_out, buf, nread); - if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); - } - if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) ) - { - nwr = bgzf_raw_write(bgzf_out, cached, neof); - if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof); + if ( nwr != nread ) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); } if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); } @@ -677,8 +715,8 @@ static void usage(args_t *args) fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n"); fprintf(stderr, " -f, --file-list Read the list of files from a file.\n"); fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); fprintf(stderr, " -o, --output Write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index be2d6a2..4445a51 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -557,100 +557,138 @@ static void concat(args_t *args) } } +int print_vcf_gz_header(BGZF *fp, BGZF *bgzf_out, int print_header, kstring_t *tmp) +{ + char *buffer = (char*) fp->uncompressed_block; + + // Read the header and find the position of the data block + if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]); + + int nskip = 1; // end of the header in the current uncompressed block + while (1) + { + if ( buffer[nskip]=='\n' ) + { + nskip++; + if ( nskip>=fp->block_length ) + { + kputsn(buffer,nskip,tmp); + if ( bgzf_read_block(fp) != 0 ) return -1; + if ( !fp->block_length ) break; + nskip = 0; + } + // The header has finished + if ( buffer[nskip]!='#' ) + { + kputsn(buffer,nskip,tmp); + break; + } + } + nskip++; + if ( nskip>=fp->block_length ) + { + kputsn(buffer,fp->block_length,tmp); + if ( bgzf_read_block(fp) != 0 ) return -1; + if ( !fp->block_length ) break; + nskip = 0; + } + } + if ( print_header ) + { + if ( bgzf_write(bgzf_out,tmp->s,tmp->l) != tmp->l ) error("Failed to write %d bytes\n", tmp->l); + tmp->l = 0; + } + return nskip; +} + +static inline int unpackInt16(const uint8_t *buffer) +{ + return buffer[0] | buffer[1] << 8; +} +static int check_header(const uint8_t *header) +{ + if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2; + return ((header[3] & 4) != 0 + && unpackInt16((uint8_t*)&header[10]) == 6 + && header[12] == 'B' && header[13] == 'C' + && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1; +} static void naive_concat(args_t *args) { // only compressed BCF atm BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; - const size_t page_size = 32768; - char *buf = (char*) malloc(page_size); + const size_t page_size = BGZF_MAX_BLOCK_SIZE; + uint8_t *buf = (uint8_t*) malloc(page_size); kstring_t tmp = {0,0,0}; - int i; + int i, file_types = 0; for (i=0; infnames; i++) { htsFile *hts_fp = hts_open(args->fnames[i],"r"); if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); htsFormat type = *hts_get_format(hts_fp); - if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); - if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); + if ( type.compression!=bgzf ) + error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); + file_types |= type.format==vcf ? 1 : 2; + if ( file_types==3 ) + error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); BGZF *fp = hts_get_bgzfp(hts_fp); if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); - uint8_t magic[5]; - if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); - if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); + int nskip; + if ( type.format==bcf ) + { + uint8_t magic[5]; + if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); - if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); - hts_expand(char,tmp.l,tmp.m,tmp.s); - if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + hts_expand(char,tmp.l,tmp.m,tmp.s); + if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); - // write only the first header - if ( i==0 ) + // write only the first header + if ( i==0 ) + { + if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); + if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); + if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname); + } + nskip = fp->block_offset; + } + else { - if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); - if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); - if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname); + nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp); + if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]); } // Output all non-header data that were read together with the header block - int nskip = fp->block_offset; if ( fp->block_length - nskip > 0 ) { - if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); + if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); } if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks - ssize_t nread, ncached = 0, nwr; - const int neof = 28; - char cached[neof]; + // The final bgzf eof block will be added by bgzf_close. + ssize_t nread, nblock, nwr; + const int nheader = 18, neof = 28; + const uint8_t *eof = (uint8_t*) "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; while (1) { - nread = bgzf_raw_read(fp, buf, page_size); - - // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends - if ( nread<=0 ) break; - if ( nread<=neof ) // last block - { - if ( ncached ) - { - // flush the part of the cache that won't be needed - nwr = bgzf_raw_write(bgzf_out, cached, nread); - if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); - - // make space in the cache so that we can append to the end - if ( nread!=neof ) memmove(cached,cached+nread,neof-nread); - } - - // fill the cache and check for eof outside this loop - memcpy(cached+neof-nread,buf,nread); - break; - } - - // not the last block, flush the cache if full - if ( ncached ) - { - nwr = bgzf_raw_write(bgzf_out, cached, ncached); - if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached); - ncached = 0; - } - - // fill the cache - nread -= neof; - memcpy(cached,buf+nread,neof); - ncached = neof; - + nread = bgzf_raw_read(fp, buf, nheader); + if ( !nread ) break; + if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]); + nblock = unpackInt16(buf+16) + 1; + assert( nblock <= page_size && nblock >= nheader ); + nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader); + if ( nread!=nblock ) error("Could not read %d bytes: %s\n",nblock,args->fnames[i]); + if ( nread==neof && !memcmp(buf,eof,neof) ) continue; nwr = bgzf_raw_write(bgzf_out, buf, nread); - if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); - } - if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) ) - { - nwr = bgzf_raw_write(bgzf_out, cached, neof); - if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof); + if ( nwr != nread ) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); } if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); } @@ -679,8 +717,8 @@ static void usage(args_t *args) fprintf(pysam_stderr, " -D, --remove-duplicates Alias for -d none\n"); fprintf(pysam_stderr, " -f, --file-list Read the list of files from a file.\n"); fprintf(pysam_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n"); + fprintf(pysam_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); fprintf(pysam_stderr, " -o, --output Write output to a file [standard output]\n"); fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(pysam_stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index 1e60d30..f650bea 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -65,7 +66,7 @@ struct _args_t int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col; int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; - char *outfname, *infname, *ref_fname; + char *outfname, *infname, *ref_fname, *sex_fname; int argc, n_threads, record_cmd_line; }; @@ -81,6 +82,9 @@ static void destroy_data(args_t *args) static void open_vcf(args_t *args, const char *format_str) { args->files = bcf_sr_init(); + if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 ) + error("Could not initialize --threads %d\n", args->n_threads); + if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) @@ -129,9 +133,6 @@ static void open_vcf(args_t *args, const char *format_str) } if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str); free(samples); - - if ( args->filter_str ) - args->filter = filter_init(args->header, args->filter_str); } static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) @@ -373,6 +374,7 @@ static void gensample_to_vcf(args_t *args) int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); + if ( !samples ) error("Could not read %s\n", sample_fname); for (i=2; in.total); } +char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) +{ + int i, nlines; + char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1); + char **lines = hts_readlist(sex_fname, 1, &nlines); + if ( !lines ) error("Could not read %s\n", sex_fname); + for (i=0; ioutfname,&str); - int n_files, i; + int n_files = 0, i; char **files = hts_readlist(str.s, 0, &n_files); if ( n_files==1 ) { @@ -712,22 +742,30 @@ static void vcf_to_gensample(args_t *args) if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname); // write samples file - if (sample_fname) { + if (sample_fname) + { + char *sample2sex = NULL; + if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; - kputs("ID_1 ID_2 missing\n0 0 0\n", &str); + kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); for (i=0; iheader); i++) { str.l = 0; - ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]); + if ( sample2sex ) + ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]); + else + ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); } if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno)); free(sample_fname); + free(sample2sex); } if (!gen_fname) { if ( str.m ) free(str.s); @@ -793,7 +831,7 @@ static void vcf_to_haplegendsample(args_t *args) char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL; str.l = 0; kputs(args->outfname,&str); - int n_files, i; + int n_files = 0, i; char **files = hts_readlist(str.s, 0, &n_files); if ( n_files==1 ) { @@ -829,7 +867,11 @@ static void vcf_to_haplegendsample(args_t *args) if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname); // write samples file - if (sample_fname) { + if (sample_fname) + { + char *sample2sex = NULL; + if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; @@ -839,12 +881,13 @@ static void vcf_to_haplegendsample(args_t *args) for (i=0; iheader); i++) { str.l = 0; - ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]); + ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2'); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); } if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno)); free(sample_fname); + free(sample2sex); } if (!hap_fname && !legend_fname) { if ( str.m ) free(str.s); @@ -853,6 +896,7 @@ static void vcf_to_haplegendsample(args_t *args) // open haps and legend outputs BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL; + if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize); BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL; if (legend_fname) { str.l = 0; @@ -940,7 +984,7 @@ static void vcf_to_hapsample(args_t *args) char *hap_fname = NULL, *sample_fname = NULL; str.l = 0; kputs(args->outfname,&str); - int n_files, i; + int n_files = 0, i; char **files = hts_readlist(str.s, 0, &n_files); if ( n_files==1 ) { @@ -970,22 +1014,30 @@ static void vcf_to_hapsample(args_t *args) if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname); // write samples file - if (sample_fname) { + if (sample_fname) + { + char *sample2sex = NULL; + if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; - kputs("ID_1 ID_2 missing\n0 0 0\n", &str); + kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); for (i=0; iheader); i++) { str.l = 0; - ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]); + if ( sample2sex ) + ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]); + else + ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); } if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno)); free(sample_fname); + free(sample2sex); } if (!hap_fname) { if ( str.m ) free(str.s); @@ -994,6 +1046,7 @@ static void vcf_to_hapsample(args_t *args) // open haps output BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL; + if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize); int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0; while ( bcf_sr_next_line(args->files) ) @@ -1256,9 +1309,30 @@ static void gvcf_to_vcf(args_t *args) if ( !pass ) continue; } - if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") ) + if (!bcf_has_filter(hdr,line,"PASS")) + { + bcf_write(out_fh,hdr,line); + continue; + } + + // check if alleles compatible with being a gVCF record + int i, gallele = -1; + if (line->n_allele==1) + gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present) + else + { + if ( line->d.allele[1][0]!='<' ) continue; + for (i=1; in_allele; i++) + { + if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF + if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF + if ( strcmp(line->d.allele[i],"")==0 ) { gallele = i; break; } // GATK gVCF + } + } + + // no gVCF compatible alleles + if (gallele<0) { - // Assuming that only ALT=. sites can be blocks and skipping sites which don't PASS bcf_write(out_fh,hdr,line); continue; } @@ -1266,7 +1340,7 @@ static void gvcf_to_vcf(args_t *args) int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp); if ( nend!=1 ) { - // No END lineord + // No INFO/END => not gVCF record bcf_write(out_fh,hdr,line); continue; } @@ -1277,10 +1351,9 @@ static void gvcf_to_vcf(args_t *args) line->pos = pos; char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1); - // we have already checked above that there is only one allele, - // so fine to just update alleles with the ref allele from the fasta - bcf_update_alleles_str(hdr, line, &ref[0]); + strncpy(line->d.allele[0],ref,len); bcf_write(out_fh,hdr,line); + free(ref); } } free(itmp); @@ -1316,6 +1389,7 @@ static void usage(void) fprintf(stderr, " -g, --gensample <...> |,\n"); fprintf(stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, "\n"); fprintf(stderr, "gVCF conversion:\n"); @@ -1326,12 +1400,14 @@ static void usage(void) fprintf(stderr, " --hapsample2vcf <...> |,\n"); fprintf(stderr, " --hapsample <...> |,\n"); fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, "\n"); fprintf(stderr, "HAP/LEGEND/SAMPLE conversion:\n"); fprintf(stderr, " -H, --haplegendsample2vcf <...> |,,\n"); fprintf(stderr, " -h, --haplegendsample <...> |,,\n"); fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, "\n"); fprintf(stderr, "TSV conversion:\n"); @@ -1375,6 +1451,7 @@ int main_vcfconvert(int argc, char *argv[]) {"targets-file",required_argument,NULL,'T'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, + {"sex",required_argument,NULL,11}, {"gensample",required_argument,NULL,'g'}, {"gensample2vcf",required_argument,NULL,'G'}, {"tag",required_argument,NULL,1}, @@ -1428,6 +1505,7 @@ int main_vcfconvert(int argc, char *argv[]) case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 10 : args->record_cmd_line = 0; break; + case 11 : args->sex_fname = optarg; break; case '?': usage(); default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index 12333cc..4d3469c 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -67,7 +68,7 @@ struct _args_t int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col; int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; - char *outfname, *infname, *ref_fname; + char *outfname, *infname, *ref_fname, *sex_fname; int argc, n_threads, record_cmd_line; }; @@ -83,6 +84,9 @@ static void destroy_data(args_t *args) static void open_vcf(args_t *args, const char *format_str) { args->files = bcf_sr_init(); + if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 ) + error("Could not initialize --threads %d\n", args->n_threads); + if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) @@ -131,9 +135,6 @@ static void open_vcf(args_t *args, const char *format_str) } if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str); free(samples); - - if ( args->filter_str ) - args->filter = filter_init(args->header, args->filter_str); } static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) @@ -375,6 +376,7 @@ static void gensample_to_vcf(args_t *args) int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); + if ( !samples ) error("Could not read %s\n", sample_fname); for (i=2; in.total); } +char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) +{ + int i, nlines; + char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1); + char **lines = hts_readlist(sex_fname, 1, &nlines); + if ( !lines ) error("Could not read %s\n", sex_fname); + for (i=0; ioutfname,&str); - int n_files, i; + int n_files = 0, i; char **files = hts_readlist(str.s, 0, &n_files); if ( n_files==1 ) { @@ -714,22 +744,30 @@ static void vcf_to_gensample(args_t *args) if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); // write samples file - if (sample_fname) { + if (sample_fname) + { + char *sample2sex = NULL; + if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; - kputs("ID_1 ID_2 missing\n0 0 0\n", &str); + kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); for (i=0; iheader); i++) { str.l = 0; - ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]); + if ( sample2sex ) + ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]); + else + ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); } if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno)); free(sample_fname); + free(sample2sex); } if (!gen_fname) { if ( str.m ) free(str.s); @@ -795,7 +833,7 @@ static void vcf_to_haplegendsample(args_t *args) char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL; str.l = 0; kputs(args->outfname,&str); - int n_files, i; + int n_files = 0, i; char **files = hts_readlist(str.s, 0, &n_files); if ( n_files==1 ) { @@ -831,7 +869,11 @@ static void vcf_to_haplegendsample(args_t *args) if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); // write samples file - if (sample_fname) { + if (sample_fname) + { + char *sample2sex = NULL; + if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; @@ -841,12 +883,13 @@ static void vcf_to_haplegendsample(args_t *args) for (i=0; iheader); i++) { str.l = 0; - ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]); + ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2'); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); } if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno)); free(sample_fname); + free(sample2sex); } if (!hap_fname && !legend_fname) { if ( str.m ) free(str.s); @@ -855,6 +898,7 @@ static void vcf_to_haplegendsample(args_t *args) // open haps and legend outputs BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL; + if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize); BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL; if (legend_fname) { str.l = 0; @@ -942,7 +986,7 @@ static void vcf_to_hapsample(args_t *args) char *hap_fname = NULL, *sample_fname = NULL; str.l = 0; kputs(args->outfname,&str); - int n_files, i; + int n_files = 0, i; char **files = hts_readlist(str.s, 0, &n_files); if ( n_files==1 ) { @@ -972,22 +1016,30 @@ static void vcf_to_hapsample(args_t *args) if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); // write samples file - if (sample_fname) { + if (sample_fname) + { + char *sample2sex = NULL; + if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; - kputs("ID_1 ID_2 missing\n0 0 0\n", &str); + kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); for (i=0; iheader); i++) { str.l = 0; - ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]); + if ( sample2sex ) + ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]); + else + ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]); ret = bgzf_write(sout, str.s, str.l); if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno)); } if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno)); free(sample_fname); + free(sample2sex); } if (!hap_fname) { if ( str.m ) free(str.s); @@ -996,6 +1048,7 @@ static void vcf_to_hapsample(args_t *args) // open haps output BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL; + if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize); int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0; while ( bcf_sr_next_line(args->files) ) @@ -1258,9 +1311,30 @@ static void gvcf_to_vcf(args_t *args) if ( !pass ) continue; } - if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") ) + if (!bcf_has_filter(hdr,line,"PASS")) + { + bcf_write(out_fh,hdr,line); + continue; + } + + // check if alleles compatible with being a gVCF record + int i, gallele = -1; + if (line->n_allele==1) + gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present) + else + { + if ( line->d.allele[1][0]!='<' ) continue; + for (i=1; in_allele; i++) + { + if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF + if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF + if ( strcmp(line->d.allele[i],"")==0 ) { gallele = i; break; } // GATK gVCF + } + } + + // no gVCF compatible alleles + if (gallele<0) { - // Assuming that only ALT=. sites can be blocks and skipping sites which don't PASS bcf_write(out_fh,hdr,line); continue; } @@ -1268,7 +1342,7 @@ static void gvcf_to_vcf(args_t *args) int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp); if ( nend!=1 ) { - // No END lineord + // No INFO/END => not gVCF record bcf_write(out_fh,hdr,line); continue; } @@ -1279,10 +1353,9 @@ static void gvcf_to_vcf(args_t *args) line->pos = pos; char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1); - // we have already checked above that there is only one allele, - // so fine to just update alleles with the ref allele from the fasta - bcf_update_alleles_str(hdr, line, &ref[0]); + strncpy(line->d.allele[0],ref,len); bcf_write(out_fh,hdr,line); + free(ref); } } free(itmp); @@ -1318,6 +1391,7 @@ static void usage(void) fprintf(pysam_stderr, " -g, --gensample <...> |,\n"); fprintf(pysam_stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); fprintf(pysam_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(pysam_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(pysam_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "gVCF conversion:\n"); @@ -1328,12 +1402,14 @@ static void usage(void) fprintf(pysam_stderr, " --hapsample2vcf <...> |,\n"); fprintf(pysam_stderr, " --hapsample <...> |,\n"); fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(pysam_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "HAP/LEGEND/SAMPLE conversion:\n"); fprintf(pysam_stderr, " -H, --haplegendsample2vcf <...> |,,\n"); fprintf(pysam_stderr, " -h, --haplegendsample <...> |,,\n"); fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(pysam_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "TSV conversion:\n"); @@ -1377,6 +1453,7 @@ int main_vcfconvert(int argc, char *argv[]) {"targets-file",required_argument,NULL,'T'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, + {"sex",required_argument,NULL,11}, {"gensample",required_argument,NULL,'g'}, {"gensample2vcf",required_argument,NULL,'G'}, {"tag",required_argument,NULL,1}, @@ -1430,6 +1507,7 @@ int main_vcfconvert(int argc, char *argv[]) case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 10 : args->record_cmd_line = 0; break; + case 11 : args->sex_fname = optarg; break; case '?': usage(); default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c index f979d77..c1b41f2 100644 --- a/bcftools/vcffilter.c +++ b/bcftools/vcffilter.c @@ -129,7 +129,8 @@ static void init_data(args_t *args) if ( tmp.s ) kputs(" and ", &tmp); kputs("\"IndelGap\"", &tmp); } - fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); + if ( strncmp(tmp.s+1,args->soft_filter,tmp.l-2) ) + fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); free(tmp.s); } diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index 58193da..e603bde 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -131,7 +131,8 @@ static void init_data(args_t *args) if ( tmp.s ) kputs(" and ", &tmp); kputs("\"IndelGap\"", &tmp); } - fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); + if ( strncmp(tmp.s+1,args->soft_filter,tmp.l-2) ) + fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); free(tmp.s); } diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c index b741ef6..8835db3 100644 --- a/bcftools/vcfgtcheck.c +++ b/bcftools/vcfgtcheck.c @@ -35,7 +35,9 @@ THE SOFTWARE. */ #include #include #include +#include #include "bcftools.h" +#include "hclust.h" typedef struct { @@ -43,10 +45,10 @@ typedef struct bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF int ntmp_arr, npl_arr; int32_t *tmp_arr, *pl_arr; - double *lks, *sites; + double *lks, *sites, min_inter_err, max_intra_err; int *cnts, *dps, hom_only, cross_check, all_sites; char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample; - int argc, no_PLs; + int argc, no_PLs, narr, nsmpl; } args_t; @@ -133,6 +135,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample) free(fname); } +#if 0 static void plot_cross_check(args_t *args) { char *fname; @@ -214,6 +217,7 @@ static void plot_cross_check(args_t *args) py_plot(fname); free(fname); } +#endif static void init_data(args_t *args) { @@ -230,14 +234,6 @@ static void init_data(args_t *args) args->sites = (double*) calloc(nsamples,sizeof(double)); args->dps = (int*) calloc(nsamples,sizeof(int)); } - else - { - int nsamples = bcf_hdr_nsamples(args->sm_hdr); - int narr = (nsamples-1)*nsamples/2; - args->lks = (double*) calloc(narr,sizeof(double)); - args->cnts = (int*) calloc(narr,sizeof(int)); - args->dps = (int*) calloc(narr,sizeof(int)); - } } static void destroy_data(args_t *args) @@ -524,177 +520,181 @@ static void check_gt(args_t *args) } } -static inline int is_hom_most_likely(int nals, int *pls) +// static inline int is_hom_most_likely(int nals, int *pls) +// { +// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0]; +// for (ia=1; iasm_hdr, line, &args->tmp_arr, &args->ntmp_arr); + + if ( ngt<=0 ) return 1; // GT not present + if ( ngt!=args->nsmpl*2 ) return 2; // not diploid + ngt /= args->nsmpl; + + int i,j, idx = 0; + for (i=1; insmpl; i++) + { + int32_t *a = args->tmp_arr + i*ngt; + if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; } + int agt = 1<tmp_arr + j*ngt; + if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; } + int bgt = 1<sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr); + + if ( npl<=0 ) return 1; // PL not present + npl /= args->nsmpl; + + int i,j,k, idx = 0; + for (i=1; insmpl; i++) { - for (ib=0; ibtmp_arr + i*npl; + int imin = -1; + for (k=0; k a[k] ) imin = k; + } + if ( imin<0 ) { idx+=i; continue; } + + for (j=0; jtmp_arr + j*npl; + int jmin = -1; + for (k=0; k b[k] ) jmin = k; + } + if ( jmin<0 ) { idx++; continue; } + + ntot[idx]++; + if ( imin!=jmin ) ndif[idx]++; idx++; } - if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; } - idx++; } - return min_is_hom; + return 0; } static void cross_check_gts(args_t *args) { - int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0; - unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day... - int fake_pls = args->no_PLs, ignore_dp = 0; - - int i,j,k,idx, pl_warned = 0, dp_warned = 0; - int32_t *dp_arr = NULL; - int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL; + // Initialize things: check which tags are defined in the header, sample names etc. if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) { if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); - if ( !args->no_PLs ) + if ( !args->no_PLs ) { fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); - fake_pls = 1; + args->no_PLs = 99; + } } - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1; - FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout; - print_header(args, fp); - if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n"); + args->nsmpl = bcf_hdr_nsamples(args->sm_hdr); + args->narr = (args->nsmpl-1)*args->nsmpl/2; + + uint32_t *ndif = (uint32_t*) calloc(args->narr,4); + uint32_t *ntot = (uint32_t*) calloc(args->narr,4); while ( bcf_sr_next_line(args->files) ) { - bcf1_t *line = args->files->readers[0].buffer[0]; - bcf_unpack(line, BCF_UN_FMT); - - int npl; - if ( !fake_pls ) - { - npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr); - if ( npl<=0 ) { pl_warned++; continue; } - npl /= nsamples; - } - else - npl = fake_PLs(args, args->sm_hdr, line); - int mdp = 0; - if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++; + bcf1_t *line = bcf_sr_get_line(args->files,0); - if ( args->hom_only ) + // use PLs unless no_PLs is set and GT exists + if ( args->no_PLs ) { - for (i=0; in_allele, args->pl_arr+i*npl); + if ( process_GT(args,line,ntot,ndif)==0 ) continue; } - - double sum = 0; int nsum = 0; - idx = 0; - for (i=0; ipl_arr[i*npl]; - if ( *ipl==-1 ) { idx += i; continue; } // missing genotype - if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; } - if ( args->hom_only && !is_hom[i] ) { idx += i; continue; } - - for (j=0; jpl_arr[j*npl]; - if ( *jpl==-1 ) { idx++; continue; } // missing genotype - if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; } - if ( args->hom_only && !is_hom[j] ) { idx++; continue; } - - int min_pl = INT_MAX; - for (k=0; k ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k]; - } - if ( k!=npl ) { idx++; continue; } - - if ( args->all_sites ) { sum += min_pl; nsum++; } - args->lks[idx] += min_pl; - args->cnts[idx]++; - - if ( mdp>0 ) - { - args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j]; - dp[i] += dp_arr[i]; ndp[i]++; - dp[j] += dp_arr[j]; ndp[j]++; - } - else - { - args->dps[idx]++; - dp[i]++; ndp[i]++; - dp[j]++; ndp[j]++; - } - idx++; - } - } - if ( args->all_sites ) - fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0); + process_PL(args,line,ntot,ndif); } - if ( dp_arr ) free(dp_arr); - if ( args->pl_arr ) free(args->pl_arr); - if ( args->tmp_arr ) free(args->tmp_arr); - if ( is_hom ) free(is_hom); + + FILE *fp = stdout; + print_header(args, fp); - if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned); - if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned); + float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2); - // Output samples sorted by average discordance - double *score = (double*) calloc(nsamples,sizeof(double)); - args->sites = (double*) calloc(nsamples,sizeof(double)); - idx = 0; - for (i=0; insmpl; i++) { for (j=0; jlks[idx]; - score[j] += args->lks[idx]; - args->sites[i] += args->cnts[idx]; - args->sites[j] += args->cnts[idx]; + float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10; + fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); + PDIST(tmp,i,j) = err; idx++; } } - for (i=0; isites[i] ) score[i] /= args->sites[i]; - double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0; - for (i=0; imax_intra_err; + hclust_t *clust = hclust_init(args->nsmpl,tmp); + cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist); + fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n"); + for (i=0; isites[idx]/(nsamples-1); - avg_score += score[idx]; - fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i); + fprintf(fp,"CLUSTER\t%f", list[i].dist); + for (j=0; jsm_hdr->samples[list[i].memb[j]]); + fprintf(fp,"\n"); } - - // // Overall score: maximum absolute deviation from the average score - // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n"); - // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set - free(p); - free(score); - free(dp); - free(ndp); - - // Pairwise discordances + hclust_destroy_list(list,nlist); + // Debugging output: the cluster graph and data used for deciding + char **dbg = hclust_explain(clust,&nlist); + for (i=0; ism_hdr->samples,clust_max_err)); + hclust_destroy(clust); + free(tmp); + + + // Deprecated output for temporary backward compatibility + fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n"); fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n"); idx = 0; - for (i=0; insmpl; i++) { for (j=0; jlks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0, - args->sm_hdr->samples[i],args->sm_hdr->samples[j]); + fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); idx++; } } - fclose(fp); - if ( args->plot ) - plot_cross_check(args); + + free(ndif); + free(ntot); + free(args->tmp_arr); } static char *init_prefix(char *prefix) @@ -713,6 +713,7 @@ static void usage(void) fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -a, --all-sites output comparison for all sites\n"); + fprintf(stderr, " -c, --cluster min inter- and max intra-sample error [0.23,-0.3]\n"); fprintf(stderr, " -g, --genotypes genotypes to compare against\n"); fprintf(stderr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); fprintf(stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); @@ -736,8 +737,16 @@ int main_vcfgtcheck(int argc, char *argv[]) char *regions = NULL, *targets = NULL; int regions_is_file = 0, targets_is_file = 0; + // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 + // - min_inter: pairs with smaller err value will be considered identical + // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered + // different. If negative, the cutoff may be heuristically lowered + args->min_inter_err = 0.23; + args->max_intra_err = -0.3; + static struct option loptions[] = { + {"cluster",1,0,'c'}, {"GTs-only",1,0,'G'}, {"all-sites",0,0,'a'}, {"homs-only",0,0,'H'}, @@ -753,8 +762,17 @@ int main_vcfgtcheck(int argc, char *argv[]) {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) { switch (c) { + case 'c': + args->min_inter_err = strtod(optarg,&tmp); + if ( *tmp ) + { + if ( *tmp!=',') error("Could not parse: -c %s\n", optarg); + args->max_intra_err = strtod(tmp+1,&tmp); + if ( *tmp ) error("Could not parse: -c %s\n", optarg); + } + break; case 'G': args->no_PLs = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg); diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index 2f0a288..0bd6071 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -37,7 +37,9 @@ THE SOFTWARE. */ #include #include #include +#include #include "bcftools.h" +#include "hclust.h" typedef struct { @@ -45,10 +47,10 @@ typedef struct bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF int ntmp_arr, npl_arr; int32_t *tmp_arr, *pl_arr; - double *lks, *sites; + double *lks, *sites, min_inter_err, max_intra_err; int *cnts, *dps, hom_only, cross_check, all_sites; char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample; - int argc, no_PLs; + int argc, no_PLs, narr, nsmpl; } args_t; @@ -135,6 +137,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample) free(fname); } +#if 0 static void plot_cross_check(args_t *args) { char *fname; @@ -216,6 +219,7 @@ static void plot_cross_check(args_t *args) py_plot(fname); free(fname); } +#endif static void init_data(args_t *args) { @@ -232,14 +236,6 @@ static void init_data(args_t *args) args->sites = (double*) calloc(nsamples,sizeof(double)); args->dps = (int*) calloc(nsamples,sizeof(int)); } - else - { - int nsamples = bcf_hdr_nsamples(args->sm_hdr); - int narr = (nsamples-1)*nsamples/2; - args->lks = (double*) calloc(narr,sizeof(double)); - args->cnts = (int*) calloc(narr,sizeof(int)); - args->dps = (int*) calloc(narr,sizeof(int)); - } } static void destroy_data(args_t *args) @@ -526,177 +522,181 @@ static void check_gt(args_t *args) } } -static inline int is_hom_most_likely(int nals, int *pls) +// static inline int is_hom_most_likely(int nals, int *pls) +// { +// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0]; +// for (ia=1; iasm_hdr, line, &args->tmp_arr, &args->ntmp_arr); + + if ( ngt<=0 ) return 1; // GT not present + if ( ngt!=args->nsmpl*2 ) return 2; // not diploid + ngt /= args->nsmpl; + + int i,j, idx = 0; + for (i=1; insmpl; i++) + { + int32_t *a = args->tmp_arr + i*ngt; + if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; } + int agt = 1<tmp_arr + j*ngt; + if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; } + int bgt = 1<sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr); + + if ( npl<=0 ) return 1; // PL not present + npl /= args->nsmpl; + + int i,j,k, idx = 0; + for (i=1; insmpl; i++) { - for (ib=0; ibtmp_arr + i*npl; + int imin = -1; + for (k=0; k a[k] ) imin = k; + } + if ( imin<0 ) { idx+=i; continue; } + + for (j=0; jtmp_arr + j*npl; + int jmin = -1; + for (k=0; k b[k] ) jmin = k; + } + if ( jmin<0 ) { idx++; continue; } + + ntot[idx]++; + if ( imin!=jmin ) ndif[idx]++; idx++; } - if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; } - idx++; } - return min_is_hom; + return 0; } static void cross_check_gts(args_t *args) { - int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0; - unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day... - int fake_pls = args->no_PLs, ignore_dp = 0; - - int i,j,k,idx, pl_warned = 0, dp_warned = 0; - int32_t *dp_arr = NULL; - int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL; + // Initialize things: check which tags are defined in the header, sample names etc. if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) { if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); - if ( !args->no_PLs ) + if ( !args->no_PLs ) { fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); - fake_pls = 1; + args->no_PLs = 99; + } } - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1; - FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout; - print_header(args, fp); - if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n"); + args->nsmpl = bcf_hdr_nsamples(args->sm_hdr); + args->narr = (args->nsmpl-1)*args->nsmpl/2; + + uint32_t *ndif = (uint32_t*) calloc(args->narr,4); + uint32_t *ntot = (uint32_t*) calloc(args->narr,4); while ( bcf_sr_next_line(args->files) ) { - bcf1_t *line = args->files->readers[0].buffer[0]; - bcf_unpack(line, BCF_UN_FMT); - - int npl; - if ( !fake_pls ) - { - npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr); - if ( npl<=0 ) { pl_warned++; continue; } - npl /= nsamples; - } - else - npl = fake_PLs(args, args->sm_hdr, line); - int mdp = 0; - if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++; + bcf1_t *line = bcf_sr_get_line(args->files,0); - if ( args->hom_only ) + // use PLs unless no_PLs is set and GT exists + if ( args->no_PLs ) { - for (i=0; in_allele, args->pl_arr+i*npl); + if ( process_GT(args,line,ntot,ndif)==0 ) continue; } - - double sum = 0; int nsum = 0; - idx = 0; - for (i=0; ipl_arr[i*npl]; - if ( *ipl==-1 ) { idx += i; continue; } // missing genotype - if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; } - if ( args->hom_only && !is_hom[i] ) { idx += i; continue; } - - for (j=0; jpl_arr[j*npl]; - if ( *jpl==-1 ) { idx++; continue; } // missing genotype - if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; } - if ( args->hom_only && !is_hom[j] ) { idx++; continue; } - - int min_pl = INT_MAX; - for (k=0; k ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k]; - } - if ( k!=npl ) { idx++; continue; } - - if ( args->all_sites ) { sum += min_pl; nsum++; } - args->lks[idx] += min_pl; - args->cnts[idx]++; - - if ( mdp>0 ) - { - args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j]; - dp[i] += dp_arr[i]; ndp[i]++; - dp[j] += dp_arr[j]; ndp[j]++; - } - else - { - args->dps[idx]++; - dp[i]++; ndp[i]++; - dp[j]++; ndp[j]++; - } - idx++; - } - } - if ( args->all_sites ) - fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0); + process_PL(args,line,ntot,ndif); } - if ( dp_arr ) free(dp_arr); - if ( args->pl_arr ) free(args->pl_arr); - if ( args->tmp_arr ) free(args->tmp_arr); - if ( is_hom ) free(is_hom); + + FILE *fp = pysam_stdout; + print_header(args, fp); - if ( pl_warned ) fprintf(pysam_stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned); - if ( dp_warned ) fprintf(pysam_stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned); + float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2); - // Output samples sorted by average discordance - double *score = (double*) calloc(nsamples,sizeof(double)); - args->sites = (double*) calloc(nsamples,sizeof(double)); - idx = 0; - for (i=0; insmpl; i++) { for (j=0; jlks[idx]; - score[j] += args->lks[idx]; - args->sites[i] += args->cnts[idx]; - args->sites[j] += args->cnts[idx]; + float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10; + fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); + PDIST(tmp,i,j) = err; idx++; } } - for (i=0; isites[i] ) score[i] /= args->sites[i]; - double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0; - for (i=0; imax_intra_err; + hclust_t *clust = hclust_init(args->nsmpl,tmp); + cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist); + fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n"); + for (i=0; isites[idx]/(nsamples-1); - avg_score += score[idx]; - fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i); + fprintf(fp,"CLUSTER\t%f", list[i].dist); + for (j=0; jsm_hdr->samples[list[i].memb[j]]); + fprintf(fp,"\n"); } - - // // Overall score: maximum absolute deviation from the average score - // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n"); - // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set - free(p); - free(score); - free(dp); - free(ndp); - - // Pairwise discordances + hclust_destroy_list(list,nlist); + // Debugging output: the cluster graph and data used for deciding + char **dbg = hclust_explain(clust,&nlist); + for (i=0; ism_hdr->samples,clust_max_err)); + hclust_destroy(clust); + free(tmp); + + + // Deprecated output for temporary backward compatibility + fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n"); fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n"); idx = 0; - for (i=0; insmpl; i++) { for (j=0; jlks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0, - args->sm_hdr->samples[i],args->sm_hdr->samples[j]); + fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); idx++; } } - fclose(fp); - if ( args->plot ) - plot_cross_check(args); + + free(ndif); + free(ntot); + free(args->tmp_arr); } static char *init_prefix(char *prefix) @@ -715,6 +715,7 @@ static void usage(void) fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "Options:\n"); fprintf(pysam_stderr, " -a, --all-sites output comparison for all sites\n"); + fprintf(pysam_stderr, " -c, --cluster min inter- and max intra-sample error [0.23,-0.3]\n"); fprintf(pysam_stderr, " -g, --genotypes genotypes to compare against\n"); fprintf(pysam_stderr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); fprintf(pysam_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); @@ -738,8 +739,16 @@ int main_vcfgtcheck(int argc, char *argv[]) char *regions = NULL, *targets = NULL; int regions_is_file = 0, targets_is_file = 0; + // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 + // - min_inter: pairs with smaller err value will be considered identical + // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered + // different. If negative, the cutoff may be heuristically lowered + args->min_inter_err = 0.23; + args->max_intra_err = -0.3; + static struct option loptions[] = { + {"cluster",1,0,'c'}, {"GTs-only",1,0,'G'}, {"all-sites",0,0,'a'}, {"homs-only",0,0,'H'}, @@ -755,8 +764,17 @@ int main_vcfgtcheck(int argc, char *argv[]) {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) { switch (c) { + case 'c': + args->min_inter_err = strtod(optarg,&tmp); + if ( *tmp ) + { + if ( *tmp!=',') error("Could not parse: -c %s\n", optarg); + args->max_intra_err = strtod(tmp+1,&tmp); + if ( *tmp ) error("Could not parse: -c %s\n", optarg); + } + break; case 'G': args->no_PLs = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg); diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c index d1e9179..aa60fb2 100644 --- a/bcftools/vcfindex.c +++ b/bcftools/vcfindex.c @@ -1,4 +1,3 @@ - /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. Copyright (C) 2014-2016 Genome Research Ltd. @@ -32,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #define __STDC_FORMAT_MACROS #include +#include #include "bcftools.h" #define BCF_LIDX_SHIFT 14 @@ -43,24 +43,22 @@ static void usage(void) fprintf(stderr, "Usage: bcftools index [options] |\n"); fprintf(stderr, "\n"); fprintf(stderr, "Indexing options:\n"); - fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); - fprintf(stderr, " -f, --force overwrite index if it already exists\n"); - fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n"); + fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); + fprintf(stderr, " -f, --force overwrite index if it already exists\n"); + fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); + fprintf(stderr, " -o, --output-file FILE optional output index file name\n"); + fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n"); + fprintf(stderr, " --threads sets the number of threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Stats options:\n"); fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n"); - fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n"); + fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n"); fprintf(stderr, "\n"); exit(1); } int vcf_index_stats(char *fname, int stats) { - char *fn_out = NULL; - FILE *out; - out = fn_out ? fopen(fn_out, "w") : stdout; - const char **seq; int i, nseq; tbx_t *tbx = NULL; @@ -74,12 +72,12 @@ int vcf_index_stats(char *fname, int stats) if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); - if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; } + if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); - if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; } + if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; } } else { @@ -97,7 +95,7 @@ int vcf_index_stats(char *fname, int stats) if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; - fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); + printf("%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); } if (!sum) { @@ -106,14 +104,13 @@ int vcf_index_stats(char *fname, int stats) bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { - fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); + fprintf(stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname); return 1; } bcf_destroy1(rec); } - if (stats&2) fprintf(out, "%" PRIu64 "\n", sum); + if (stats&2) printf("%" PRIu64 "\n", sum); free(seq); - fclose(out); hts_close(fp); bcf_hdr_destroy(hdr); if (tbx) @@ -125,8 +122,9 @@ int vcf_index_stats(char *fname, int stats) int main_vcfindex(int argc, char *argv[]) { - int c, force = 0, tbi = 0, stats = 0; + int c, force = 0, tbi = 0, stats = 0, n_threads = 0; int min_shift = BCF_LIDX_SHIFT; + char *outfn = NULL; static struct option loptions[] = { @@ -136,27 +134,33 @@ int main_vcfindex(int argc, char *argv[]) {"min-shift",required_argument,NULL,'m'}, {"stats",no_argument,NULL,'s'}, {"nrecords",no_argument,NULL,'n'}, + {"threads",required_argument,NULL,9}, + {"output-file",required_argument,NULL,'o'}, {NULL, 0, NULL, 0} }; char *tmp; - while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0) { switch (c) { case 'c': tbi = 0; break; case 't': tbi = 1; min_shift = 0; break; case 'f': force = 1; break; - case 'm': + case 'm': min_shift = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg); break; case 's': stats |= 1; break; case 'n': stats |= 2; break; + case 9: + n_threads = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); + break; + case 'o': outfn = optarg; break; default: usage(); } } - if ( optind==argc ) usage(); if (stats>2) { fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); @@ -173,69 +177,48 @@ int main_vcfindex(int argc, char *argv[]) return 1; } - char *fname = argv[optind]; - if (stats) return vcf_index_stats(fname, stats); - - htsFile *fp = hts_open(fname,"r"); - if ( !fp ) error("Failed to read %s\n", fname); - htsFormat type = *hts_get_format(fp); - hts_close(fp); - - if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf ) + char *fname = NULL; + if ( optind>=argc ) { - fprintf(stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__); - if ( type.compression!=bgzf ) - fprintf(stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__); - return 1; - } - if (tbi && type.format==bcf) - { - fprintf(stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n"); - tbi = 0; min_shift = BCF_LIDX_SHIFT; + if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + else usage(); } - if (min_shift == 0 && type.format==bcf) - { - fprintf(stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__); - return 1; - } - if (!tbi && type.format==vcf && min_shift == 0) + else fname = argv[optind]; + if (stats) return vcf_index_stats(fname, stats); + + kstring_t idx_fname = {0,0,0}; + if (outfn) + kputs(outfn,&idx_fname); + else { - fprintf(stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n"); - tbi = 1; + if (!strcmp(fname, "-")) { fprintf(stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; } + ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi"); } - if (!force) { // Before complaining about existing index, check if the VCF file isn't newer. - char *idx_fname = (char*)alloca(strlen(fname) + 5); - strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi"); struct stat stat_tbi, stat_file; - if ( stat(idx_fname, &stat_tbi)==0 ) + if ( stat(idx_fname.s, &stat_tbi)==0 ) { stat(fname, &stat_file); if ( stat_file.st_mtime <= stat_tbi.st_mtime ) { - fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__); + fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s); + free(idx_fname.s); return 1; } } } - if (type.format==bcf) - { - if ( bcf_index_build(fname, min_shift) != 0 ) - { - fprintf(stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname); - return 1; - } - } - else - { - if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 ) - { - fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname); - return 1; - } + int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads); + free(idx_fname.s); + if (ret != 0) { + if (ret == -2) + error("index: failed to open \"%s\"\n", fname); + else if (ret == -3) + error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname); + else + error("index: failed to create index for \"%s\"\n", fname); } return 0; } diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index 479fc57..ff960b9 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -1,6 +1,5 @@ #include "pysam.h" - /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. Copyright (C) 2014-2016 Genome Research Ltd. @@ -34,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #define __STDC_FORMAT_MACROS #include +#include #include "bcftools.h" #define BCF_LIDX_SHIFT 14 @@ -45,24 +45,22 @@ static void usage(void) fprintf(pysam_stderr, "Usage: bcftools index [options] |\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "Indexing options:\n"); - fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); - fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n"); - fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); + fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); + fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n"); + fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); + fprintf(pysam_stderr, " -o, --output-file FILE optional output index file name\n"); + fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); + fprintf(pysam_stderr, " --threads sets the number of threads [0]\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "Stats options:\n"); fprintf(pysam_stderr, " -n, --nrecords print number of records based on existing index file\n"); - fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n"); + fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n"); fprintf(pysam_stderr, "\n"); exit(1); } int vcf_index_stats(char *fname, int stats) { - char *fn_out = NULL; - FILE *out; - out = fn_out ? fopen(fn_out, "w") : pysam_stdout; - const char **seq; int i, nseq; tbx_t *tbx = NULL; @@ -76,12 +74,12 @@ int vcf_index_stats(char *fname, int stats) if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); - if ( !tbx ) { fprintf(pysam_stderr,"Could not load TBI index: %s\n", fname); return 1; } + if ( !tbx ) { fprintf(pysam_stderr,"Could not load index for VCF: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); - if ( !idx ) { fprintf(pysam_stderr,"Could not load CSI index: %s\n", fname); return 1; } + if ( !idx ) { fprintf(pysam_stderr,"Could not load index for BCF file: %s\n", fname); return 1; } } else { @@ -99,7 +97,7 @@ int vcf_index_stats(char *fname, int stats) if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; - fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); + fprintf(pysam_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); } if (!sum) { @@ -108,14 +106,13 @@ int vcf_index_stats(char *fname, int stats) bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { - fprintf(pysam_stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); + fprintf(pysam_stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname); return 1; } bcf_destroy1(rec); } - if (stats&2) fprintf(out, "%" PRIu64 "\n", sum); + if (stats&2) fprintf(pysam_stdout, "%" PRIu64 "\n", sum); free(seq); - fclose(out); hts_close(fp); bcf_hdr_destroy(hdr); if (tbx) @@ -127,8 +124,9 @@ int vcf_index_stats(char *fname, int stats) int main_vcfindex(int argc, char *argv[]) { - int c, force = 0, tbi = 0, stats = 0; + int c, force = 0, tbi = 0, stats = 0, n_threads = 0; int min_shift = BCF_LIDX_SHIFT; + char *outfn = NULL; static struct option loptions[] = { @@ -138,27 +136,33 @@ int main_vcfindex(int argc, char *argv[]) {"min-shift",required_argument,NULL,'m'}, {"stats",no_argument,NULL,'s'}, {"nrecords",no_argument,NULL,'n'}, + {"threads",required_argument,NULL,9}, + {"output-file",required_argument,NULL,'o'}, {NULL, 0, NULL, 0} }; char *tmp; - while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0) { switch (c) { case 'c': tbi = 0; break; case 't': tbi = 1; min_shift = 0; break; case 'f': force = 1; break; - case 'm': + case 'm': min_shift = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg); break; case 's': stats |= 1; break; case 'n': stats |= 2; break; + case 9: + n_threads = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); + break; + case 'o': outfn = optarg; break; default: usage(); } } - if ( optind==argc ) usage(); if (stats>2) { fprintf(pysam_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); @@ -175,69 +179,48 @@ int main_vcfindex(int argc, char *argv[]) return 1; } - char *fname = argv[optind]; - if (stats) return vcf_index_stats(fname, stats); - - htsFile *fp = hts_open(fname,"r"); - if ( !fp ) error("Failed to read %s\n", fname); - htsFormat type = *hts_get_format(fp); - hts_close(fp); - - if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf ) + char *fname = NULL; + if ( optind>=argc ) { - fprintf(pysam_stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__); - if ( type.compression!=bgzf ) - fprintf(pysam_stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__); - return 1; - } - if (tbi && type.format==bcf) - { - fprintf(pysam_stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n"); - tbi = 0; min_shift = BCF_LIDX_SHIFT; + if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + else usage(); } - if (min_shift == 0 && type.format==bcf) - { - fprintf(pysam_stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__); - return 1; - } - if (!tbi && type.format==vcf && min_shift == 0) + else fname = argv[optind]; + if (stats) return vcf_index_stats(fname, stats); + + kstring_t idx_fname = {0,0,0}; + if (outfn) + kputs(outfn,&idx_fname); + else { - fprintf(pysam_stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n"); - tbi = 1; + if (!strcmp(fname, "-")) { fprintf(pysam_stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; } + ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi"); } - if (!force) { // Before complaining about existing index, check if the VCF file isn't newer. - char *idx_fname = (char*)alloca(strlen(fname) + 5); - strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi"); struct stat stat_tbi, stat_file; - if ( stat(idx_fname, &stat_tbi)==0 ) + if ( stat(idx_fname.s, &stat_tbi)==0 ) { stat(fname, &stat_file); if ( stat_file.st_mtime <= stat_tbi.st_mtime ) { - fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__); + fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s); + free(idx_fname.s); return 1; } } } - if (type.format==bcf) - { - if ( bcf_index_build(fname, min_shift) != 0 ) - { - fprintf(pysam_stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname); - return 1; - } - } - else - { - if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 ) - { - fprintf(pysam_stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname); - return 1; - } + int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads); + free(idx_fname.s); + if (ret != 0) { + if (ret == -2) + error("index: failed to open \"%s\"\n", fname); + else if (ret == -3) + error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname); + else + error("index: failed to create index for \"%s\"\n", fname); } return 0; } diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index 02fac6b..1aeb739 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -24,28 +24,39 @@ THE SOFTWARE. */ #include #include +#include #include #include #include #include #include #include +#include #include #include +#include #include "bcftools.h" +#include "regidx.h" #include "vcmp.h" +#define DBG 0 + #include KHASH_MAP_INIT_STR(strdict, int) typedef khash_t(strdict) strdict_t; -#define SKIP_DONE 1 -#define SKIP_DIFF 2 +#define FLT_LOGIC_ADD 0 +#define FLT_LOGIC_REMOVE 1 + +#define SKIP_DONE 1 // the record was processed +#define SKIP_DIFF 2 // not compatible, merge later #define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G) #define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A) #define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R) +#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; } + // For merging INFO Number=A,G,R tags typedef struct { @@ -63,43 +74,61 @@ typedef struct _info_rule_t void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule); int type; // one of BCF_HT_* int block_size; // number of values in a block + int type_size; // size of the corresponding BCF_HT_* type int nblocks; // number of blocks in nvals (the number of merged files) int nvals, mvals; // used and total size of vals array void *vals; // the info tag values } info_rule_t; +typedef struct +{ + bcf1_t *line; + int end, active; +} +gvcf_aux_t; + // Auxiliary merge data for selecting the right combination // of buffered records across multiple readers. maux1_t // corresponds to one buffered line. typedef struct { int skip; - int *map; // mapping from input alleles to the output array + int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles) int mmap; // size of map array (only buffer[i].n_allele is actually used) int als_differ; } maux1_t; typedef struct { - int n; // number of readers + int rid; // current rid + int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush. + int cur; // current line or -1 if none + int npos; // number of unprocessed lines at this position + int mrec; // allocated size of buf + maux1_t *rec; // buffer to keep reader's lines + bcf1_t **lines; // source buffer: either gvcf or readers' buffer +} +buffer_t; +typedef struct +{ + int n, pos, var_types; // number of readers, current position, currently available variant types + char *chr; // current chromosome char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output int nals, mals, nout_als, mout_als; // size of the output array int *cnt, ncnt; // number of records that refer to the alleles - int *nbuf; // readers have buffers of varying lengths int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases) - int *flt, mflt, minf; - bcf_info_t *inf;// out_line's INFO fields bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT int nfmt_map; // number of rows in the fmt_map array int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes void *tmp_arr; int ntmp_arr; - maux1_t **d; // d[i][j] i-th reader, j-th buffer line + buffer_t *buf; AGR_info_t *AGR_info; int nAGR_info, mAGR_info; bcf_srs_t *files; - int *has_line; // which files are being merged + int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present + gvcf_aux_t *gvcf; // buffer of gVCF lines } maux_t; @@ -107,8 +136,11 @@ typedef struct { vcmp_t *vcmp; maux_t *maux; - int header_only, collapse, output_type, force_samples, merge_by_id; + regidx_t *regs; // apply regions only after the blocks are expanded + regitr_t *regs_itr; + int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref; char *header_fname, *output_fname, *regions_list, *info_rules, *file_list; + faidx_t *gvcf_fai; info_rule_t *rules; int nrules; strdict_t *tmph; @@ -122,6 +154,14 @@ typedef struct } args_t; +static bcf1_t *maux_get_line(args_t *args, int i) +{ + maux_t *ma = args->maux; + int ibuf = ma->buf[i].cur; + if ( ibuf >= 0 ) return ma->buf[i].lines[ibuf]; + return NULL; +} + static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule) { if ( !rule->nvals ) return; @@ -247,6 +287,32 @@ static void info_rules_init(args_t *args) if ( str.l ) kputc(',',&str); kputs("DP4:sum",&str); } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "QS")) ) + { + if ( str.l ) kputc(',',&str); + kputs("QS:sum",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) ) + { + if ( str.l ) kputc(',',&str); + kputs("MinDP:min",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "I16")) ) + { + if ( str.l ) kputc(',',&str); + kputs("I16:sum",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IDV")) ) + { + if ( str.l ) kputc(',',&str); + kputs("IDV:max",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IMF")) ) + { + if ( str.l ) kputc(',',&str); + kputs("IMF:max",&str); + } + if ( !str.l ) return; args->info_rules = str.s; } @@ -272,9 +338,12 @@ static void info_rules_init(args_t *args) int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag); if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag); rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id); - if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag); + if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); + else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); + else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); + else error("The type is not supported: \"%s\"\n", rule->hdr_tag); - while ( *ss ) ss++; ss++; + ss = strchr(ss, '\0'); ss++; if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag); int is_join = 0; @@ -300,7 +369,8 @@ static void info_rules_init(args_t *args) error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag); } - while ( *ss ) ss++; ss++; n++; + ss = strchr(ss, '\0'); ss++; + n++; } free(str.s); free(tmp); @@ -326,8 +396,10 @@ static void info_rules_reset(args_t *args) } static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len) { - int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type); + int msize = args->maux->ntmp_arr / rule->type_size; + int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type); if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret); + args->maux->ntmp_arr = msize * rule->type_size; rule->nblocks++; @@ -345,7 +417,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf int i, j; if ( var_len==BCF_VL_A ) { - assert( ret==line->n_allele-1 ); + if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); args->maux->nagr_map = ret; hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); // create mapping from source file ALT indexes to dst file indexes @@ -354,7 +426,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf } else if ( var_len==BCF_VL_R ) { - assert( ret==line->n_allele ); + if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); args->maux->nagr_map = ret; hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); for (i=0; imaux->agr_map[i] = als->map[i]; @@ -556,6 +628,8 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) { for (i=0; i<*nb; i++) { + if ( b[i][0]=='<' ) continue; // symbolic allele, do not modify + if ( b[i][0]=='*' ) continue; // overlapping deletion (*), do not modify int l = strlen(b[i]); b[i] = (char*) realloc(b[i],l+rla-rlb+1); memcpy(b[i]+l,a[0]+rlb,rla-rlb+1); @@ -565,13 +639,15 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) // now check if the $a alleles are present and if not add them for (i=1; irla ) // $a alleles need expanding + if ( rlb>rla && a[i][0]!='<' && a[i][0]!='*' ) // $a alleles need expanding and not a symbolic allele or * { int l = strlen(a[i]); ai = (char*) malloc(l+rlb-rla+1); memcpy(ai,a[i],l); memcpy(ai+l,b[0]+rla,rlb-rla+1); + const_ai = 0; } else ai = a[i]; @@ -582,42 +658,59 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) if ( j<*nb ) // $b already has the same allele { map[i] = j; - if ( rlb>rla ) free(ai); + if ( !const_ai ) free(ai); continue; } // new allele map[i] = *nb; - b[*nb] = rlb>rla ? ai : strdup(ai); + if ( b[*nb] ) free(b[*nb]); + b[*nb] = const_ai ? strdup(ai) : ai; (*nb)++; } return b; } -maux_t *maux_init(bcf_srs_t *files) +maux_t *maux_init(args_t *args) { + bcf_srs_t *files = args->files; maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t)); ma->n = files->nreaders; - ma->nbuf = (int *) calloc(ma->n,sizeof(int)); - ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*)); ma->files = files; int i, n_smpl = 0; for (i=0; in; i++) n_smpl += bcf_hdr_nsamples(files->readers[i].header); + if ( args->do_gvcf ) + { + ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); + for (i=0; in; i++) + ma->gvcf[i].line = bcf_init1(); + } ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int)); ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int)); - ma->has_line = (int*) malloc(ma->n*sizeof(int)); + ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); + for (i=0; in; i++) + ma->buf[i].rid = -1; return ma; } void maux_destroy(maux_t *ma) { - int i; + int i,j; + for (i=0; imals; i++) + { + free(ma->als[i]); + ma->als[i] = NULL; + } for (i=0; in; i++) // for each reader { - if ( !ma->d[i] ) continue; - int j; - for (j=0; jnbuf[i]; j++) // for each buffered line - if ( ma->d[i][j].map ) free(ma->d[i][j].map); - free(ma->d[i]); + for (j=0; jbuf[i].mrec; j++) // for each buffered line + free(ma->buf[i].rec[j].map); + free(ma->buf[i].rec); + } + free(ma->buf); + if ( ma->gvcf ) + { + for (i=0; in; i++) bcf_destroy(ma->gvcf[i].line); + free(ma->gvcf); } for (i=0; imAGR_info; i++) free(ma->AGR_info[i].buf); @@ -626,32 +719,69 @@ void maux_destroy(maux_t *ma) if (ma->ntmp_arr) free(ma->tmp_arr); if (ma->nfmt_map) free(ma->fmt_map); // ma->inf freed in bcf_destroy1 - free(ma->d); - free(ma->nbuf); for (i=0; imals; i++) free(ma->als[i]); if (ma->mout_als) free(ma->out_als); free(ma->als); free(ma->cnt); free(ma->smpl_ploidy); free(ma->smpl_nGsize); - free(ma->has_line); + free(ma->chr); free(ma); } -void maux_expand1(maux_t *ma, int i) +void maux_expand1(buffer_t *buf, int size) { - if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer ) + if ( buf->mrec < size ) { - int n = ma->files->readers[i].nbuffer + 1; - ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n); - memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i])); - ma->nbuf[i] = n; + hts_expand0(maux1_t,size,buf->mrec,buf->rec); + buf->mrec = size; } } void maux_reset(maux_t *ma) { - int i; - for (i=0; in; i++) maux_expand1(ma, i); - for (i=1; incnt; i++) ma->cnt[i] = 0; + int i,j; + for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); + for (i=0; incnt; i++) ma->cnt[i] = 0; + for (i=0; imals; i++) + { + free(ma->als[i]); + ma->als[i] = NULL; + } + const char *chr = NULL; + ma->nals = 0; + ma->pos = -1; + for (i=0; in; i++) + { + if ( !bcf_sr_has_line(ma->files,i) ) continue; + bcf1_t *line = bcf_sr_get_line(ma->files,i); + bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i); + chr = bcf_seqname(hdr,line); + ma->pos = line->pos; + break; + } + if ( chr ) + { + free(ma->chr); + ma->chr = strdup(chr); + } + for (i=0; in; i++) + { + bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i); + ma->buf[i].rid = bcf_hdr_name2id(hdr,chr); + ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1; + for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) + { + ma->buf[i].rec[j].skip = 0; + bcf1_t *line = ma->files->readers[i].buffer[j]; + if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break; + } + ma->buf[i].end = j; + ma->buf[i].cur = -1; + if ( ma->buf[i].beg < ma->buf[i].end ) + { + ma->buf[i].lines = ma->files->readers[i].buffer; + if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record + } + } } void maux_debug(maux_t *ma, int ir, int ib) { @@ -684,16 +814,20 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) out->pos = -1; for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + bcf1_t *line = maux_get_line(args, i); + if ( !line ) continue; + bcf_unpack(line, BCF_UN_ALL); bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; - // alleles + // not all maux alleles are always used, mark the ones we'll need int j; for (j=1; jn_allele; j++) - al_idxs[ ma->d[i][0].map[j] ] = 1; + { + int irec = ma->buf[i].cur; + al_idxs[ ma->buf[i].rec[irec].map[j] ] = 1; + } // position if ( out->pos==-1 ) @@ -717,16 +851,15 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) } // set QUAL to the max qual value. Not exactly correct, but good enough for now - if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) ) + if ( !bcf_float_is_missing(line->qual) ) { - if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual; + if ( bcf_float_is_missing(out->qual) || out->qual < line->qual ) out->qual = line->qual; } } // set ID if ( !tmps->l ) kputs(".", tmps); - if ( out->d.id ) free(out->d.id); - out->d.id = strdup(tmps->s); + bcf_update_id(out_hdr, out, tmps->s); // set alleles ma->nout_als = 0; @@ -740,10 +873,13 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) int ir, j; for (ir=0; irnreaders; ir++) { - if ( !ma->has_line[ir] ) continue; - bcf1_t *line = files->readers[ir].buffer[0]; + bcf1_t *line = maux_get_line(args,ir); + if ( !line ) continue; for (j=1; jn_allele; j++) - if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als; + { + int irec = ma->buf[ir].cur; + if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als; + } } } // Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block. @@ -765,20 +901,36 @@ void merge_filter(args_t *args, bcf1_t *out) bcf_hdr_t *out_hdr = args->out_hdr; int i, ret; + if ( args->filter_logic == FLT_LOGIC_REMOVE ) + { + for (i=0; inreaders; i++) + { + bcf1_t *line = maux_get_line(args, i); + if ( !line ) continue; + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + if ( bcf_has_filter(hdr, line, "PASS") ) break; + } + if ( inreaders ) + { + int flt_id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS"); + bcf_add_filter(out_hdr, out, flt_id); + return; + } + } + khiter_t kitr; strdict_t *tmph = args->tmph; kh_clear(strdict, tmph); - maux_t *ma = args->maux; out->d.n_flt = 0; for (i=0; inreaders; i++) { - if ( !ma->has_line[i]) continue; + bcf1_t *line = maux_get_line(args, i); + if ( !line ) continue; bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; - bcf_unpack(line, BCF_UN_ALL); int k; for (k=0; kd.n_flt; k++) @@ -789,8 +941,8 @@ void merge_filter(args_t *args, bcf1_t *out) { int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt); if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt); - hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt); - ma->flt[out->d.n_flt] = id; + hts_expand(int,out->d.n_flt+1,out->d.m_flt,out->d.flt); + out->d.flt[out->d.n_flt] = id; out->d.n_flt++; kh_put(strdict, tmph, flt, &ret); } @@ -801,20 +953,17 @@ void merge_filter(args_t *args, bcf1_t *out) { int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS"); for (i=0; id.n_flt; i++) - if ( ma->flt[i]==id ) break; + if ( out->d.flt[i]==id ) break; if ( id.n_flt ) { out->d.n_flt--; - for (; id.n_flt; i++) ma->flt[i] = ma->flt[i+1]; + for (; id.n_flt; i++) out->d.flt[i] = out->d.flt[i+1]; } } - out->d.flt = ma->flt; } static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str) { - assert( !info->vptr_free ); - uint8_t *ptr = info->vptr - info->vptr_off; bcf_dec_typed_int1(ptr, &ptr); @@ -833,8 +982,6 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str); info->vptr = (uint8_t*) tmp_str->s + info->vptr_off; - info->vptr_free = 1; - line->d.shared_dirty |= BCF1_DIRTY_INF; tmp_str->s = NULL; tmp_str->m = 0; tmp_str->l = 0; @@ -1029,9 +1176,10 @@ void merge_info(args_t *args, bcf1_t *out) info_rules_reset(args); for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + bcf1_t *line = maux_get_line(args,i); + if ( !line ) continue; + int irec = ma->buf[i].cur; bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; for (j=0; jn_info; j++) { @@ -1050,7 +1198,7 @@ void merge_info(args_t *args, bcf1_t *out) info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key); if ( rule ) { - maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL; + maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->buf[i].rec[irec] : NULL; if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue; } } @@ -1061,7 +1209,7 @@ void merge_info(args_t *args, bcf1_t *out) { if ( kitr == kh_end(tmph) ) { - // first occurance in this reader, alloc arrays + // seeing this key for the first time ma->nAGR_info++; hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info); kitr = kh_put(strdict, tmph, key, &ret); @@ -1079,37 +1227,36 @@ void merge_info(args_t *args, bcf1_t *out) kitr = kh_get(strdict, tmph, key); int idx = kh_val(tmph, kitr); if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1); - merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]); + merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]); continue; } if ( kitr == kh_end(tmph) ) { - hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf); - ma->inf[out->n_info].key = id; - ma->inf[out->n_info].type = inf->type; - ma->inf[out->n_info].len = inf->len; - ma->inf[out->n_info].vptr = inf->vptr; - ma->inf[out->n_info].v1.i = inf->v1.i; - ma->inf[out->n_info].v1.f = inf->v1.f; - ma->inf[out->n_info].vptr_off = inf->vptr_off; - ma->inf[out->n_info].vptr_len = inf->vptr_len; - ma->inf[out->n_info].vptr_free = inf->vptr_free; + // Seeing this key for the first time. Although quite hacky, + // this is faster than anything else given the data structures.. + + hts_expand0(bcf_info_t,out->n_info+1,out->d.m_info,out->d.info); + out->d.info[out->n_info].key = id; + out->d.info[out->n_info].type = inf->type; + out->d.info[out->n_info].len = inf->len; + out->d.info[out->n_info].v1.i = inf->v1.i; + out->d.info[out->n_info].v1.f = inf->v1.f; + out->d.info[out->n_info].vptr_off = inf->vptr_off; + out->d.info[out->n_info].vptr_len = inf->vptr_len; + out->d.info[out->n_info].vptr_free = 1; + out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); + memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off); + out->d.info[out->n_info].vptr += inf->vptr_off; if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) ) - { - // The existing packed info cannot be reused. Change the id. - // Although quite hacky, it's faster than anything else given - // the data structures - bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps); - } + bcf_info_set_id(out, &out->d.info[out->n_info], id, &args->tmps); + out->d.shared_dirty |= BCF1_DIRTY_INF; out->n_info++; kitr = kh_put(strdict, tmph, key, &ret); kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value } } } - out->d.info = ma->inf; - out->d.m_info = ma->minf; for (i=0; inrules; i++) args->rules[i].merger(args->out_hdr, out, &args->rules[i]); for (i=0; inAGR_info; i++) @@ -1154,12 +1301,14 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); + int default_gt = args->missing_to_ref ? bcf_gt_unphased(0) : bcf_gt_missing; for (i=0; inreaders; i++) { bcf_sr_t *reader = &files->readers[i]; bcf_hdr_t *hdr = reader->header; bcf_fmt_t *fmt_ori = fmt_map[i]; int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize; + int irec = ma->buf[i].cur; int j, k; if ( !fmt_ori ) @@ -1167,7 +1316,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) // missing values: assume maximum ploidy for (j=0; jsmpl_ploidy[ismpl+j]++; } + for (k=0; ksmpl_ploidy[ismpl+j]++; } tmp += nsize; } ismpl += bcf_hdr_nsamples(hdr); @@ -1176,7 +1325,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) #define BRANCH(type_t, vector_end) { \ type_t *p_ori = (type_t*) fmt_ori->p; \ - if ( !ma->d[i][0].als_differ ) \ + if ( !ma->buf[i].rec[irec].als_differ ) \ { \ /* the allele numbering is unchanged */ \ for (j=0; j>1) - 1; \ - al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \ + al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \ tmp[k] = (al << 1) | ((p_ori[k])&1); \ } \ } \ @@ -1239,7 +1388,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int nsize = 0, length = BCF_VL_FIXED, type = -1; for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + if ( !maux_get_line(args,i) ) continue; if ( !fmt_map[i] ) continue; if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key; type = fmt_map[i]->type; @@ -1277,10 +1426,12 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) bcf_sr_t *reader = &files->readers[i]; bcf_hdr_t *hdr = reader->header; bcf_fmt_t *fmt_ori = fmt_map[i]; + bcf1_t *line = maux_get_line(args, i); + int irec = ma->buf[i].cur; if ( fmt_ori ) { type = fmt_ori->type; - int nals_ori = reader->buffer[0]->n_allele; + int nals_ori = line->n_allele; if ( length==BCF_VL_G ) { // if all fields are missing then n==1 is valid @@ -1313,10 +1464,8 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) ismpl += bcf_hdr_nsamples(hdr); \ continue; \ } \ - assert( ma->has_line[i] ); \ - bcf1_t *line = reader->buffer[0]; \ src_type_t *src = (src_type_t*) fmt_ori->p; \ - if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \ + if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \ { \ /* alleles unchanged, copy over */ \ for (j=0; jn_allele; iori++) \ { \ - inew = ma->d[i][0].map[iori]; \ + inew = ma->buf[i].rec[irec].map[iori]; \ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \ if ( src_is_vector_end ) break; \ @@ -1372,10 +1521,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int iori,jori, inew,jnew; \ for (iori=0; iorin_allele; iori++) \ { \ - inew = ma->d[i][0].map[iori]; \ + inew = ma->buf[i].rec[irec].map[iori]; \ for (jori=0; jori<=iori; jori++) \ { \ - jnew = ma->d[i][0].map[jori]; \ + jnew = ma->buf[i].rec[irec].map[jori]; \ int kori = iori*(iori+1)/2 + jori; \ int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \ @@ -1412,7 +1561,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int iori,inew; \ for (iori=ifrom; iorin_allele; iori++) \ { \ - inew = ma->d[i][0].map[iori] - ifrom; \ + inew = ma->buf[i].rec[irec].map[iori] - ifrom; \ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \ if ( src_is_vector_end ) break; \ if ( src_is_missing ) tgt_set_missing; \ @@ -1461,9 +1610,9 @@ void merge_format(args_t *args, bcf1_t *out) int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + bcf1_t *line = maux_get_line(args,i); + if ( !line ) continue; bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; for (j=0; jn_fmt; j++) { @@ -1495,9 +1644,10 @@ void merge_format(args_t *args, bcf1_t *out) ma->fmt_map[ifmt*files->nreaders+i] = fmt; } // Check if the allele numbering must be changed - for (j=1; jbuffer[0]->n_allele; j++) - if ( ma->d[i][0].map[j]!=j ) break; - ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1; + int irec = ma->buf[i].cur; + for (j=1; jn_allele; j++) + if ( ma->buf[i].rec[irec].map[j]!=j ) break; + ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1; } out->n_sample = bcf_hdr_nsamples(out_hdr); @@ -1505,203 +1655,383 @@ void merge_format(args_t *args, bcf1_t *out) merge_GT(args, ma->fmt_map, out); update_AN_AC(out_hdr, out); - if ( out->d.info!=ma->inf ) - { - // hacky, we rely on htslib internals: bcf_update_info() reallocated the info - ma->inf = out->d.info; - ma->minf = out->d.m_info; - } - for (i=1; i<=max_ifmt; i++) merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); out->d.indiv_dirty = 1; } -// The core merging function, one or none line from each reader -void merge_line(args_t *args) +void gvcf_set_alleles(args_t *args) +{ + int i,k; + bcf_srs_t *files = args->files; + maux_t *maux = args->maux; + gvcf_aux_t *gaux = maux->gvcf; + maux->nals = 0; + + for (i=0; inreaders; i++) + { + if ( !gaux[i].active ) continue; + bcf1_t *line = maux_get_line(args, i); + int irec = maux->buf[i].cur; + + hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map); + if ( !maux->nals ) // first record, copy the alleles to the output + { + maux->nals = line->n_allele; + hts_expand0(char*, maux->nals, maux->mals, maux->als); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + for (k=0; knals; k++) + { + if ( maux->als[k] ) free(maux->als[k]); + maux->als[k] = strdup(line->d.allele[k]); + maux->buf[i].rec[irec].map[k] = k; + } + } + else + { + maux->als = merge_alleles(line->d.allele, line->n_allele, maux->buf[i].rec[irec].map, maux->als, &maux->nals, &maux->mals); + if ( !maux->als ) + { + bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); + error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1); + } + } + } +} + +/* + Output staged gVCF blocks, end is the last position of the block. Assuming + gaux[i].active flags are set and maux_get_line returns correct lines. +*/ +void gvcf_write_block(args_t *args, int start, int end) { + int i; + maux_t *maux = args->maux; + gvcf_aux_t *gaux = maux->gvcf; + assert(gaux); + + // Update POS + int min = INT_MAX; + char ref = 'N'; + for (i=0; ifiles->nreaders; i++) + { + if ( !gaux[i].active ) continue; + if ( ref=='N' && gaux[i].line->pos==start ) ref = gaux[i].line->d.allele[0][0]; + gaux[i].line->pos = start; + } + for (i=0; ifiles->nreaders; i++) + { + if ( !gaux[i].active ) continue; + if ( gaux[i].end < start ) + { + gaux[i].active = 0; + maux->buf[i].cur = -1; + continue; + } + gaux[i].line->d.allele[0][0] = ref; + if ( min > gaux[i].end ) min = gaux[i].end; + } + // Check for valid gVCF blocks in this region + if ( min==INT_MAX ) + { + assert(0); + maux->gvcf_min = 0; + return; + } + bcf1_t *out = args->out_line; - bcf_clear1(out); - out->unpacked = BCF_UN_ALL; + gvcf_set_alleles(args); + + // Merge the staged lines merge_chrom2qual(args, out); merge_filter(args, out); merge_info(args, out); merge_format(args, out); - bcf_write1(args->out_fh, args->out_hdr, out); -} + if ( args->gvcf_fai && out->d.allele[0][0]=='N' ) + { + int slen = 0; + char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen); + if (slen) + { + out->d.allele[0][0] = seq[0]; + free(seq); + } + } + // Update END boundary + if ( end > start ) + { + end++; + bcf_update_info_int32(args->out_hdr, out, "END", &end, 1); + } + else + bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); + bcf_write1(args->out_fh, args->out_hdr, out); + bcf_clear1(out); -void debug_buffers(FILE *fp, bcf_srs_t *files); -void debug_buffer(FILE *fp, bcf_sr_t *reader); -#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; } + // Inactivate blocks which do not extend beyond END and find new gvcf_min + min = INT_MAX; + for (i=0; ifiles->nreaders; i++) + { + if ( !gaux[i].active ) continue; + if ( gaux[i].end < end ) + { + gaux[i].active = 0; + maux->buf[i].cur = -1; + continue; + } + // next min END position bigger than the current one + if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1; + } + maux->gvcf_min = min==INT_MAX ? 0 : min; +} -// Clean the reader's buffer to and make it ready for the next next_line() call. -// Moves finished records (SKIP_DONE flag set) at the end of the buffer and put -// the rest to the beggining. Then shorten the buffer so that the last element -// points to the last unfinished record. There are two special cases: the last -// line of the buffer typically has a different position and must stay at the -// end; next, the first record of the buffer must be one of those already -// printed, as it will be discarded by next_line(). -// -void shake_buffer(maux_t *maux, int ir, int pos) +/* + Flush staged gVCF blocks. Flush everything if there are no more lines + (done=1) or if there is a new chromosome. If still on the same chromosome, + all hanging blocks must be ended by creating new records: + A + 1 END=10 + B + 3 END=7 + C + 3 END=5 + out + 1 END=2 A . . + 3 END=5 A B C + 6 END=7 A B . + 8 END=10 A . . + +*/ +void gvcf_flush(args_t *args, int done) { - bcf_sr_t *reader = &maux->files->readers[ir]; - maux1_t *m = maux->d[ir]; - - if ( !reader->buffer ) return; - int i; - // FILE *fp = stdout; - // fprintf(fp," nbuf=%d\t", reader->nbuffer); for (i=0; inbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n"); - // debug_buffer(fp,reader); - // fprintf(fp,"--\n"); + maux_t *maux = args->maux; - int a = 1, b = reader->nbuffer; - if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards + if ( !maux->chr ) return; // first time here, nothing to flush - while ( abuffer[a], reader->buffer[b]); - SWAP(maux1_t, m[a], m[b]); - a++; - b--; - } + // Get current position and chromosome + for (i=0; in; i++) + if ( bcf_sr_has_line(maux->files,i) ) break; + bcf1_t *line = bcf_sr_get_line(maux->files,i); + bcf_hdr_t *hdr = bcf_sr_get_header(maux->files,i); - // position $a to the after the first unfinished record - while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++; + if ( !strcmp(maux->chr,bcf_seqname(hdr,line)) ) flush_until = line->pos; // still on the same chr + } - if ( anbuffer ) + // When called on a region, trim the blocks accordingly + int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; + if ( args->regs ) { - // there is a gap between the unfinished lines at the beggining and the - // last line. The last line must be brought forward to fill the gap - if ( reader->buffer[reader->nbuffer]->pos != pos ) + int rstart = -1, rend = -1; + if ( regidx_overlap(args->regs,maux->chr,start,flush_until,args->regs_itr) ) { - SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]); - SWAP(maux1_t, m[a], m[reader->nbuffer]); - reader->nbuffer = a; + // In case there are multiple regions, we treat them as one + rstart = args->regs_itr->beg; + while ( regitr_overlap(args->regs_itr) ) rend = args->regs_itr->end; } + if ( rstart > start ) start = rstart; + if ( rend < flush_until ) flush_until = rend+1; } - if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos ) + // output all finished blocks + while ( maux->gvcf_min && start < flush_until ) { - // the first record is unfinished, replace it with an empty line - // from the end of the buffer or else next_line will remove it - if ( reader->nbuffer + 1 >= maux->nbuf[ir] ) + // does the block end before the new line or is it interrupted? + int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until; + if ( start > tmp-1 ) break; + gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based + start = tmp; + } +} + +/* + Check incoming lines for new gVCF blocks, set pointer to the current source + buffer (gvcf or readers). In contrast to gvcf_flush, this function can be + called only after maux_reset as it relies on updated maux buffers. +*/ +void gvcf_stage(args_t *args, int pos) +{ + maux_t *maux = args->maux; + gvcf_aux_t *gaux = maux->gvcf; + bcf_srs_t *files = args->files; + int32_t *end = (int32_t*) maux->tmp_arr; + int i, nend = maux->ntmp_arr / sizeof(int32_t); + + maux->gvcf_break = -1; + maux->gvcf_min = INT_MAX; + for (i=0; inreaders; i++) + { + if ( gaux[i].active ) { - reader->nbuffer++; - maux_expand1(maux, ir); - reader->nbuffer--; - m = maux->d[ir]; + // gvcf block should not overlap with another record + if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1; + maux->buf[i].beg = 0; + maux->buf[i].end = 1; + maux->buf[i].cur = 0; + continue; } - if ( reader->nbuffer+1 >= reader->mbuffer ) - error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer); - if ( reader->buffer[reader->nbuffer]->pos!=pos ) + // Does any of the lines have END set? It is enough to check only the + // first line, there should be no duplicate records with END in gVCF + + if ( maux->buf[i].beg==maux->buf[i].end ) continue; // no new record + + int irec = maux->buf[i].beg; + bcf_hdr_t *hdr = bcf_sr_get_header(files, i); + bcf1_t *line = args->files->readers[i].buffer[irec]; + int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); + if ( ret==1 ) { - // 4way swap - bcf1_t *tmp = reader->buffer[0]; - reader->buffer[0] = reader->buffer[reader->nbuffer+1]; - reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer]; - reader->buffer[reader->nbuffer] = tmp; - m[reader->nbuffer].skip = m[0].skip; - m[reader->nbuffer+1].skip = SKIP_DIFF; - reader->nbuffer++; + // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with + // an empty record: the gaux line must be kept until we reach its END. + gaux[i].active = 1; + gaux[i].end = end[0] - 1; + SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line); + gaux[i].line->pos = pos; + + maux->buf[i].lines = &gaux[i].line; + maux->buf[i].beg = 0; + maux->buf[i].end = 1; + maux->buf[i].cur = 0; + + // Set the rid,pos of the swapped line in the buffer or else the + // synced reader will have a problem with the next line + // + args->files->readers[i].buffer[irec]->rid = maux->buf[i].rid; + args->files->readers[i].buffer[irec]->pos = maux->pos; + + // Update block offsets + if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1; } else - { - SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]); - SWAP(maux1_t, m[0], m[reader->nbuffer+1]); - } + maux->gvcf_break = line->pos; // must break the gvcf block } + maux->ntmp_arr = nend * sizeof(int32_t); + maux->tmp_arr = end; + if ( maux->gvcf_min==INT_MAX ) maux->gvcf_min = 0; +} + + +void debug_buffers(FILE *fp, bcf_srs_t *files); +void debug_buffer(FILE *fp, bcf_srs_t *files, int reader); + +/* + Flush all buffered and processed records with the same coordinate. + Note that synced reader discards buffer[0], so that needs to stay + untouched. +*/ +void clean_buffer(args_t *args) +{ + maux_t *ma = args->maux; + + int ir; + for (ir=0; irn; ir++) + { + // Invalidate pointer to reader's buffer or else gvcf_flush will attempt + // to use the old lines via maux_get_line() + if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; - // debug_buffer(fp,reader); - // fprintf(fp,"\t"); for (i=0; inbuffer; i++) fprintf(fp," %d", skip[i]); - // fprintf(fp,"\n\n"); + bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir); + if ( !reader->nbuffer ) continue; // nothing to clean - // set position of finished buffer[0] line to -1, otherwise swapping may - // bring it back after next_line() - reader->buffer[0]->pos = -1; + bcf1_t **buf = reader->buffer; + if ( buf[1]->rid!=ma->buf[ir].rid || buf[1]->pos!=ma->pos ) continue; // nothing to flush - // trim the buffer, remove finished lines from the end - i = reader->nbuffer; - while ( i>=1 && m[i--].skip&SKIP_DONE ) - reader->nbuffer--; + int a = 1, b = 2; + while ( b<=reader->nbuffer && buf[b]->rid==ma->buf[ir].rid && buf[b]->pos==ma->pos ) b++; + // b now points to the first line we want to preserve + while ( b<=reader->nbuffer ) + { + SWAP(bcf1_t*, buf[a], buf[b]); + a++; b++; + } + reader->nbuffer -= b-a; + } } -void debug_maux(args_t *args, int pos, int var_type) +void debug_maux(args_t *args) { bcf_srs_t *files = args->files; maux_t *maux = args->maux; int j,k,l; - fprintf(stderr,"Alleles to merge at %d\n", pos+1); + fprintf(stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals); for (j=0; jnreaders; j++) { bcf_sr_t *reader = &files->readers[j]; + buffer_t *buf = &maux->buf[j]; fprintf(stderr," reader %d: ", j); - for (k=0; k<=reader->nbuffer; k++) + for (k=buf->beg; kend; k++) { - if ( maux->d[j][k].skip==SKIP_DONE ) continue; + if ( buf->rec[k].skip & SKIP_DONE ) continue; bcf1_t *line = reader->buffer[k]; - if ( line->pos!=pos ) continue; fprintf(stderr,"\t"); - if ( maux->d[j][k].skip ) fprintf(stderr,"["); // this record will not be merged in this round + if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record will not be merged in this round for (l=0; ln_allele; l++) fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]); - if ( maux->d[j][k].skip ) fprintf(stderr,"]"); + if ( buf->rec[k].skip ) fprintf(stderr,"]"); } fprintf(stderr,"\n"); } fprintf(stderr," counts: "); - for (j=0; jnals; j++) fprintf(stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(stderr,"\n"); - for (j=0; jnreaders; j++) - { - bcf_sr_t *reader = &files->readers[j]; - fprintf(stderr," out %d: ", j); - for (k=0; k<=reader->nbuffer; k++) - { - if ( maux->d[j][k].skip==SKIP_DONE ) continue; - bcf1_t *line = reader->buffer[k]; - if ( line->pos!=pos ) continue; - if ( maux->d[j][k].skip ) continue; - fprintf(stderr,"\t"); - for (l=0; ln_allele; l++) - fprintf(stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]); - } - fprintf(stderr,"\n"); - } - fprintf(stderr,"\n"); + for (j=0; jnals; j++) fprintf(stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); + fprintf(stderr,"\n\n"); } -// Determine which line should be merged from which reader: go through all -// readers and all buffered lines, expand REF,ALT and try to match lines with -// the same ALTs. A step towards output independent on input ordering of the -// lines. -void merge_buffer(args_t *args) + +/* + Determine which line should be merged from which reader: go through all + readers and all buffered lines, expand REF,ALT and try to match lines with + the same ALTs. + */ +int can_merge(args_t *args) { bcf_srs_t *files = args->files; - int i, pos = -1, var_type = 0; - char *id = NULL; + int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; maux_t *maux = args->maux; - maux_reset(maux); + gvcf_aux_t *gaux = maux->gvcf; + char *id = NULL, ref = 'N'; + maux->var_types = maux->nals = 0; - // set the current position + int i,j,k, ntodo = 0; for (i=0; inreaders; i++) { - if ( bcf_sr_has_line(files,i) ) + buffer_t *buf = &maux->buf[i]; + + if ( gaux && gaux[i].active ) { - bcf1_t *line = bcf_sr_get_line(files,i); - pos = line->pos; - var_type = bcf_get_variant_types(line); - id = line->d.id; - break; + // skip readers with active gvcf blocks + buf->rec[buf->beg].skip = SKIP_DIFF; + continue; + } + for (j=buf->beg; jend; j++) + { + if ( buf->rec[j].skip & SKIP_DONE ) continue; + + buf->rec[j].skip = SKIP_DIFF; + ntodo++; + + if ( args->merge_by_id ) + id = buf->lines[j]->d.id; + else + { + int var_type = bcf_get_variant_types(buf->lines[j]); + maux->var_types |= var_type ? var_type<<1 : 1; + } } + + // for gvcf: find out REF at this position + if ( buf->beg < buf->end && ref=='N' ) + ref = buf->lines[buf->beg]->d.allele[0][0]; } + if ( !ntodo ) return 0; // In this loop we select from each reader compatible candidate lines. // (i.e. SNPs or indels). Go through all files and all lines at this @@ -1710,19 +2040,24 @@ void merge_buffer(args_t *args) for (i=0; inreaders; i++) { bcf_sr_t *reader = &files->readers[i]; - if ( !reader->buffer ) continue; - int j, k; - for (j=0; j<=reader->nbuffer; j++) + buffer_t *buf = &maux->buf[i]; + + if ( gaux && gaux[i].active ) { - bcf1_t *line = reader->buffer[j]; + gaux[i].line->d.allele[0][0] = ref; + gaux[i].line->pos = maux->pos; + } + + for (j=buf->beg; jend; j++) + { + if ( buf->rec[j].skip & SKIP_DONE ) continue; + + bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer + int line_type = bcf_get_variant_types(line); + line_type = line_type ? line_type<<1 : 1; + // select relevant lines - maux->d[i][j].skip = SKIP_DIFF; - if ( pos!=line->pos ) - { - if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore - continue; - } if ( args->merge_by_id ) { if ( strcmp(id,line->d.id) ) continue; @@ -1733,30 +2068,30 @@ void merge_buffer(args_t *args) { // All alleles of the tested record must be present in the // selected maux record plus variant types must be the same - if ( var_type!=line->d.var_type ) continue; + if ( (maux->var_types & line_type) != line_type ) continue; if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible for (k=1; kn_allele; k++) { if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break; } - if ( k==line->n_allele ) continue; // no matching allele + if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele } if ( !(args->collapse&COLLAPSE_ANY) ) { - int compatible = 0; - if ( line_type==var_type ) compatible = 1; - else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything - else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1; - else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1; - else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1; - else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1; - else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1; - if ( !compatible ) continue; + // Merge: + // - SNPs+SNPs+MNPs+REF if -m both,snps + // - indels+indels+REF if -m both,indels, REF only if SNPs are not present + // - SNPs come first + if ( line_type & indel_mask ) + { + if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first + if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks + } } } - maux->d[i][j].skip = 0; + buf->rec[j].skip = 0; - hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map); + hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map); if ( !maux->nals ) // first record, copy the alleles to the output { maux->nals = line->n_allele; @@ -1764,111 +2099,118 @@ void merge_buffer(args_t *args) hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); for (k=0; knals; k++) { + free(maux->als[k]); maux->als[k] = strdup(line->d.allele[k]); - maux->d[i][j].map[k] = k; + buf->rec[j].map[k] = k; maux->cnt[k] = 1; } - pos = line->pos; continue; } - // normalize alleles - maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals); - if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname); + maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); + if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(args->out_hdr,line),line->pos+1,reader->fname); hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); for (k=1; kn_allele; k++) - maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files + maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files maux->cnt[0]++; } } + return 1; +} - // debug_maux(args, pos, var_type); +/* + Select records that have the same alleles; the input ordering of indels + must not matter. Multiple VCF lines can be emitted from this loop. + We expect only very few alleles and not many records with the same + position in the buffers, therefore the nested loops should not slow us + much. +*/ +void stage_line(args_t *args) +{ + int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; + bcf_srs_t *files = args->files; + maux_t *maux = args->maux; - // Select records that have the same alleles; the input ordering of indels - // must not matter. Multiple VCF lines can be emitted from this loop. - // We expect only very few alleles and not many records with the same - // position in the buffers, therefore the nested loops should not slow us - // much. - while (1) + // debug_maux(args); + + // take the most frequent allele present in multiple files, REF is skipped + int i,j,k,icnt = 1; + for (i=2; inals; i++) + if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i; + + int nout = 0; + for (i=0; inreaders; i++) { - // take the most frequent allele present in multiple files - int icnt = 0; - for (i=1; inals; i++) - if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i; - if ( maux->cnt[icnt]<0 ) break; + buffer_t *buf = &maux->buf[i]; + buf->cur = -1; + if ( buf->beg >= buf->end ) continue; // no lines in the buffer - int nmask = 0; - for (i=0; inreaders; i++) + // find lines with the same allele + for (j=buf->beg; jend; j++) { - maux->has_line[i] = 0; + if ( buf->rec[j].skip ) continue; // done or not compatible + if ( args->merge_by_id ) break; + if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record - bcf_sr_t *reader = &files->readers[i]; - if ( !reader->buffer ) continue; + for (k=0; klines[j]->n_allele; k++) + if ( icnt==buf->rec[j].map[k] ) break; - // find lines with the same allele - int j; - for (j=0; j<=reader->nbuffer; j++) - { - if ( maux->d[i][j].skip ) continue; - int k; - for (k=0; kbuffer[j]->n_allele; k++) - if ( icnt==maux->d[i][j].map[k] ) break; - if ( kbuffer[j]->n_allele ) break; - } - if ( j>reader->nbuffer ) - { - // no matching allele found in this file - if ( args->collapse==COLLAPSE_NONE ) continue; + if ( klines[j]->n_allele ) break; + } + if ( j>=buf->end ) + { + // no matching allele found in this file + if ( args->collapse==COLLAPSE_NONE ) continue; - for (j=0; j<=reader->nbuffer; j++) + for (j=buf->beg; jend; j++) + { + if ( buf->rec[j].skip ) continue; // done or not compatible + if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged + int line_type = bcf_get_variant_types(buf->lines[j]); + if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; + if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; + if ( line_type==VCF_REF ) { - if ( maux->d[i][j].skip ) continue; - if ( args->collapse&COLLAPSE_ANY ) break; - int line_type = bcf_get_variant_types(reader->buffer[j]); - if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - if ( line_type==VCF_REF ) - { - if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - } - else if ( var_type==VCF_REF ) - { - if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - } + if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; + if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ref_mask ) break; } - } - if ( j<=reader->nbuffer ) - { - // found a suitable line for merging, place it at the beggining - if ( j>0 ) + else if ( maux->var_types&ref_mask ) { - SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]); - SWAP(maux1_t, maux->d[i][0], maux->d[i][j]); + if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; + if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; } - // mark as finished so that it's ignored next time - maux->d[i][0].skip |= SKIP_DONE; - maux->has_line[i] = 1; - nmask++; } } - if ( !nmask ) break; // done, no more lines suitable for merging found - merge_line(args); // merge and output the line - maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished + if ( jend ) + { + // found a suitable line for merging + buf->cur = j; + + // mark as finished so that it's ignored next time + buf->rec[j].skip = SKIP_DONE; + nout++; + } } + assert( nout ); +} - // clean the alleles - for (i=0; inals; i++) +void merge_line(args_t *args) +{ + if ( args->regs ) { - free(maux->als[i]); - maux->als[i] = 0; + if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return; } - maux->nals = 0; - // get the buffers ready for the next next_line() call - for (i=0; inreaders; i++) - shake_buffer(maux, i, pos); + bcf1_t *out = args->out_line; + merge_chrom2qual(args, out); + merge_filter(args, out); + merge_info(args, out); + if ( args->do_gvcf ) + bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); + merge_format(args, out); + bcf_write1(args->out_fh, args->out_hdr, out); + bcf_clear1(out); } void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd) @@ -1887,6 +2229,8 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c else ksprintf(&str, " %s", argv[i]); } + kputs("; Date=", &str); + time_t tm; time(&tm); kputs(ctime(&tm), &str); kputc('\n', &str); bcf_hdr_append(hdr,str.s); free(str.s); @@ -1898,7 +2242,7 @@ void merge_vcf(args_t *args) { args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); + if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); if ( args->header_fname ) @@ -1928,14 +2272,32 @@ void merge_vcf(args_t *args) } if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init(); - args->maux = maux_init(args->files); + args->maux = maux_init(args); args->out_line = bcf_init1(); args->tmph = kh_init(strdict); - int ret; - while ( (ret=bcf_sr_next_line(args->files)) ) + + while ( bcf_sr_next_line(args->files) ) { - merge_buffer(args); + // output cached gVCF blocks which end before the new record + if ( args->do_gvcf ) + gvcf_flush(args,0); + + maux_reset(args->maux); + + // determine which of the new records are gvcf blocks + if ( args->do_gvcf ) + gvcf_stage(args, args->maux->pos); + + while ( can_merge(args) ) + { + stage_line(args); + merge_line(args); + } + clean_buffer(args); } + if ( args->do_gvcf ) + gvcf_flush(args,1); + info_rules_destroy(args); maux_destroy(args->maux); bcf_hdr_destroy(args->out_hdr); @@ -1958,7 +2320,10 @@ static void usage(void) fprintf(stderr, " --force-samples resolve duplicate sample names\n"); fprintf(stderr, " --print-header print only the merged header and exit\n"); fprintf(stderr, " --use-header use the provided header\n"); + fprintf(stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n"); fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(stderr, " -F, --filter-logic remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); + fprintf(stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); fprintf(stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(stderr, " -l, --file-list read file names from the file\n"); fprintf(stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); @@ -1989,7 +2354,9 @@ int main_vcfmerge(int argc, char *argv[]) { {"help",no_argument,NULL,'h'}, {"merge",required_argument,NULL,'m'}, + {"gvcf",required_argument,NULL,'g'}, {"file-list",required_argument,NULL,'l'}, + {"missing-to-ref",no_argument,NULL,'0'}, {"apply-filters",required_argument,NULL,'f'}, {"use-header",required_argument,NULL,1}, {"print-header",no_argument,NULL,2}, @@ -2001,10 +2368,25 @@ int main_vcfmerge(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"info-rules",required_argument,NULL,'i'}, {"no-version",no_argument,NULL,8}, + {"filter-logic",required_argument,NULL,'F'}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) { switch (c) { + case 'F': + if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD; + else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE; + else error("Filter logic not recognised: %s\n", optarg); + break; + case '0': args->missing_to_ref = 1; break; + case 'g': + args->do_gvcf = 1; + if ( strcmp("-",optarg) ) + { + args->gvcf_fai = fai_load(optarg); + if ( !args->gvcf_fai ) error("Failed to load the fai index: %s\n", optarg); + } + break; case 'l': args->file_list = optarg; break; case 'i': args->info_rules = optarg; break; case 'o': args->output_fname = optarg; break; @@ -2045,9 +2427,23 @@ int main_vcfmerge(int argc, char *argv[]) if ( argc-optind<2 && !args->file_list ) usage(); args->files->require_index = 1; - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( args->regions_list ) + { + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + if ( regions_is_file ) + args->regs = regidx_init(args->regions_list,NULL,NULL,sizeof(char*),NULL); + else + { + args->regs = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); + if ( regidx_insert_list(args->regs,args->regions_list,',') !=0 ) error("Could not parse the regions: %s\n", args->regions_list); + regidx_insert(args->regs,NULL); + } + if ( !args->regs ) error("Could not parse the regions: %s\n", args->regions_list); + args->regs_itr = regitr_init(args->regs); + } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); while (optindfiles, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); @@ -2065,6 +2461,9 @@ int main_vcfmerge(int argc, char *argv[]) } merge_vcf(args); bcf_sr_destroy(args->files); + if ( args->regs ) regidx_destroy(args->regs); + if ( args->regs_itr ) regitr_destroy(args->regs_itr); + if ( args->gvcf_fai ) fai_destroy(args->gvcf_fai); free(args); return 0; } diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index daac458..db9aff5 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -2,7 +2,7 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -26,28 +26,39 @@ THE SOFTWARE. */ #include #include +#include #include #include #include #include #include #include +#include #include #include +#include #include "bcftools.h" +#include "regidx.h" #include "vcmp.h" +#define DBG 0 + #include KHASH_MAP_INIT_STR(strdict, int) typedef khash_t(strdict) strdict_t; -#define SKIP_DONE 1 -#define SKIP_DIFF 2 +#define FLT_LOGIC_ADD 0 +#define FLT_LOGIC_REMOVE 1 + +#define SKIP_DONE 1 // the record was processed +#define SKIP_DIFF 2 // not compatible, merge later #define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G) #define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A) #define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R) +#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; } + // For merging INFO Number=A,G,R tags typedef struct { @@ -65,43 +76,61 @@ typedef struct _info_rule_t void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule); int type; // one of BCF_HT_* int block_size; // number of values in a block + int type_size; // size of the corresponding BCF_HT_* type int nblocks; // number of blocks in nvals (the number of merged files) int nvals, mvals; // used and total size of vals array void *vals; // the info tag values } info_rule_t; +typedef struct +{ + bcf1_t *line; + int end, active; +} +gvcf_aux_t; + // Auxiliary merge data for selecting the right combination // of buffered records across multiple readers. maux1_t // corresponds to one buffered line. typedef struct { int skip; - int *map; // mapping from input alleles to the output array + int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles) int mmap; // size of map array (only buffer[i].n_allele is actually used) int als_differ; } maux1_t; typedef struct { - int n; // number of readers + int rid; // current rid + int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush. + int cur; // current line or -1 if none + int npos; // number of unprocessed lines at this position + int mrec; // allocated size of buf + maux1_t *rec; // buffer to keep reader's lines + bcf1_t **lines; // source buffer: either gvcf or readers' buffer +} +buffer_t; +typedef struct +{ + int n, pos, var_types; // number of readers, current position, currently available variant types + char *chr; // current chromosome char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output int nals, mals, nout_als, mout_als; // size of the output array int *cnt, ncnt; // number of records that refer to the alleles - int *nbuf; // readers have buffers of varying lengths int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases) - int *flt, mflt, minf; - bcf_info_t *inf;// out_line's INFO fields bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT int nfmt_map; // number of rows in the fmt_map array int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes void *tmp_arr; int ntmp_arr; - maux1_t **d; // d[i][j] i-th reader, j-th buffer line + buffer_t *buf; AGR_info_t *AGR_info; int nAGR_info, mAGR_info; bcf_srs_t *files; - int *has_line; // which files are being merged + int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present + gvcf_aux_t *gvcf; // buffer of gVCF lines } maux_t; @@ -109,8 +138,11 @@ typedef struct { vcmp_t *vcmp; maux_t *maux; - int header_only, collapse, output_type, force_samples, merge_by_id; + regidx_t *regs; // apply regions only after the blocks are expanded + regitr_t *regs_itr; + int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref; char *header_fname, *output_fname, *regions_list, *info_rules, *file_list; + faidx_t *gvcf_fai; info_rule_t *rules; int nrules; strdict_t *tmph; @@ -124,6 +156,14 @@ typedef struct } args_t; +static bcf1_t *maux_get_line(args_t *args, int i) +{ + maux_t *ma = args->maux; + int ibuf = ma->buf[i].cur; + if ( ibuf >= 0 ) return ma->buf[i].lines[ibuf]; + return NULL; +} + static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule) { if ( !rule->nvals ) return; @@ -249,6 +289,32 @@ static void info_rules_init(args_t *args) if ( str.l ) kputc(',',&str); kputs("DP4:sum",&str); } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "QS")) ) + { + if ( str.l ) kputc(',',&str); + kputs("QS:sum",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) ) + { + if ( str.l ) kputc(',',&str); + kputs("MinDP:min",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "I16")) ) + { + if ( str.l ) kputc(',',&str); + kputs("I16:sum",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IDV")) ) + { + if ( str.l ) kputc(',',&str); + kputs("IDV:max",&str); + } + if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IMF")) ) + { + if ( str.l ) kputc(',',&str); + kputs("IMF:max",&str); + } + if ( !str.l ) return; args->info_rules = str.s; } @@ -274,9 +340,12 @@ static void info_rules_init(args_t *args) int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag); if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag); rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id); - if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag); + if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); + else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); + else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); + else error("The type is not supported: \"%s\"\n", rule->hdr_tag); - while ( *ss ) ss++; ss++; + ss = strchr(ss, '\0'); ss++; if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag); int is_join = 0; @@ -302,7 +371,8 @@ static void info_rules_init(args_t *args) error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag); } - while ( *ss ) ss++; ss++; n++; + ss = strchr(ss, '\0'); ss++; + n++; } free(str.s); free(tmp); @@ -328,8 +398,10 @@ static void info_rules_reset(args_t *args) } static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len) { - int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type); + int msize = args->maux->ntmp_arr / rule->type_size; + int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type); if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret); + args->maux->ntmp_arr = msize * rule->type_size; rule->nblocks++; @@ -347,7 +419,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf int i, j; if ( var_len==BCF_VL_A ) { - assert( ret==line->n_allele-1 ); + if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); args->maux->nagr_map = ret; hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); // create mapping from source file ALT indexes to dst file indexes @@ -356,7 +428,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf } else if ( var_len==BCF_VL_R ) { - assert( ret==line->n_allele ); + if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); args->maux->nagr_map = ret; hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); for (i=0; imaux->agr_map[i] = als->map[i]; @@ -558,6 +630,8 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) { for (i=0; i<*nb; i++) { + if ( b[i][0]=='<' ) continue; // symbolic allele, do not modify + if ( b[i][0]=='*' ) continue; // overlapping deletion (*), do not modify int l = strlen(b[i]); b[i] = (char*) realloc(b[i],l+rla-rlb+1); memcpy(b[i]+l,a[0]+rlb,rla-rlb+1); @@ -567,13 +641,15 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) // now check if the $a alleles are present and if not add them for (i=1; irla ) // $a alleles need expanding + if ( rlb>rla && a[i][0]!='<' && a[i][0]!='*' ) // $a alleles need expanding and not a symbolic allele or * { int l = strlen(a[i]); ai = (char*) malloc(l+rlb-rla+1); memcpy(ai,a[i],l); memcpy(ai+l,b[0]+rla,rlb-rla+1); + const_ai = 0; } else ai = a[i]; @@ -584,42 +660,59 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) if ( j<*nb ) // $b already has the same allele { map[i] = j; - if ( rlb>rla ) free(ai); + if ( !const_ai ) free(ai); continue; } // new allele map[i] = *nb; - b[*nb] = rlb>rla ? ai : strdup(ai); + if ( b[*nb] ) free(b[*nb]); + b[*nb] = const_ai ? strdup(ai) : ai; (*nb)++; } return b; } -maux_t *maux_init(bcf_srs_t *files) +maux_t *maux_init(args_t *args) { + bcf_srs_t *files = args->files; maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t)); ma->n = files->nreaders; - ma->nbuf = (int *) calloc(ma->n,sizeof(int)); - ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*)); ma->files = files; int i, n_smpl = 0; for (i=0; in; i++) n_smpl += bcf_hdr_nsamples(files->readers[i].header); + if ( args->do_gvcf ) + { + ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); + for (i=0; in; i++) + ma->gvcf[i].line = bcf_init1(); + } ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int)); ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int)); - ma->has_line = (int*) malloc(ma->n*sizeof(int)); + ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); + for (i=0; in; i++) + ma->buf[i].rid = -1; return ma; } void maux_destroy(maux_t *ma) { - int i; + int i,j; + for (i=0; imals; i++) + { + free(ma->als[i]); + ma->als[i] = NULL; + } for (i=0; in; i++) // for each reader { - if ( !ma->d[i] ) continue; - int j; - for (j=0; jnbuf[i]; j++) // for each buffered line - if ( ma->d[i][j].map ) free(ma->d[i][j].map); - free(ma->d[i]); + for (j=0; jbuf[i].mrec; j++) // for each buffered line + free(ma->buf[i].rec[j].map); + free(ma->buf[i].rec); + } + free(ma->buf); + if ( ma->gvcf ) + { + for (i=0; in; i++) bcf_destroy(ma->gvcf[i].line); + free(ma->gvcf); } for (i=0; imAGR_info; i++) free(ma->AGR_info[i].buf); @@ -628,32 +721,69 @@ void maux_destroy(maux_t *ma) if (ma->ntmp_arr) free(ma->tmp_arr); if (ma->nfmt_map) free(ma->fmt_map); // ma->inf freed in bcf_destroy1 - free(ma->d); - free(ma->nbuf); for (i=0; imals; i++) free(ma->als[i]); if (ma->mout_als) free(ma->out_als); free(ma->als); free(ma->cnt); free(ma->smpl_ploidy); free(ma->smpl_nGsize); - free(ma->has_line); + free(ma->chr); free(ma); } -void maux_expand1(maux_t *ma, int i) +void maux_expand1(buffer_t *buf, int size) { - if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer ) + if ( buf->mrec < size ) { - int n = ma->files->readers[i].nbuffer + 1; - ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n); - memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i])); - ma->nbuf[i] = n; + hts_expand0(maux1_t,size,buf->mrec,buf->rec); + buf->mrec = size; } } void maux_reset(maux_t *ma) { - int i; - for (i=0; in; i++) maux_expand1(ma, i); - for (i=1; incnt; i++) ma->cnt[i] = 0; + int i,j; + for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); + for (i=0; incnt; i++) ma->cnt[i] = 0; + for (i=0; imals; i++) + { + free(ma->als[i]); + ma->als[i] = NULL; + } + const char *chr = NULL; + ma->nals = 0; + ma->pos = -1; + for (i=0; in; i++) + { + if ( !bcf_sr_has_line(ma->files,i) ) continue; + bcf1_t *line = bcf_sr_get_line(ma->files,i); + bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i); + chr = bcf_seqname(hdr,line); + ma->pos = line->pos; + break; + } + if ( chr ) + { + free(ma->chr); + ma->chr = strdup(chr); + } + for (i=0; in; i++) + { + bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i); + ma->buf[i].rid = bcf_hdr_name2id(hdr,chr); + ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1; + for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) + { + ma->buf[i].rec[j].skip = 0; + bcf1_t *line = ma->files->readers[i].buffer[j]; + if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break; + } + ma->buf[i].end = j; + ma->buf[i].cur = -1; + if ( ma->buf[i].beg < ma->buf[i].end ) + { + ma->buf[i].lines = ma->files->readers[i].buffer; + if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record + } + } } void maux_debug(maux_t *ma, int ir, int ib) { @@ -686,16 +816,20 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) out->pos = -1; for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + bcf1_t *line = maux_get_line(args, i); + if ( !line ) continue; + bcf_unpack(line, BCF_UN_ALL); bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; - // alleles + // not all maux alleles are always used, mark the ones we'll need int j; for (j=1; jn_allele; j++) - al_idxs[ ma->d[i][0].map[j] ] = 1; + { + int irec = ma->buf[i].cur; + al_idxs[ ma->buf[i].rec[irec].map[j] ] = 1; + } // position if ( out->pos==-1 ) @@ -719,16 +853,15 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) } // set QUAL to the max qual value. Not exactly correct, but good enough for now - if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) ) + if ( !bcf_float_is_missing(line->qual) ) { - if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual; + if ( bcf_float_is_missing(out->qual) || out->qual < line->qual ) out->qual = line->qual; } } // set ID if ( !tmps->l ) kputs(".", tmps); - if ( out->d.id ) free(out->d.id); - out->d.id = strdup(tmps->s); + bcf_update_id(out_hdr, out, tmps->s); // set alleles ma->nout_als = 0; @@ -742,10 +875,13 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) int ir, j; for (ir=0; irnreaders; ir++) { - if ( !ma->has_line[ir] ) continue; - bcf1_t *line = files->readers[ir].buffer[0]; + bcf1_t *line = maux_get_line(args,ir); + if ( !line ) continue; for (j=1; jn_allele; j++) - if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als; + { + int irec = ma->buf[ir].cur; + if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als; + } } } // Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block. @@ -767,20 +903,36 @@ void merge_filter(args_t *args, bcf1_t *out) bcf_hdr_t *out_hdr = args->out_hdr; int i, ret; + if ( args->filter_logic == FLT_LOGIC_REMOVE ) + { + for (i=0; inreaders; i++) + { + bcf1_t *line = maux_get_line(args, i); + if ( !line ) continue; + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + if ( bcf_has_filter(hdr, line, "PASS") ) break; + } + if ( inreaders ) + { + int flt_id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS"); + bcf_add_filter(out_hdr, out, flt_id); + return; + } + } + khiter_t kitr; strdict_t *tmph = args->tmph; kh_clear(strdict, tmph); - maux_t *ma = args->maux; out->d.n_flt = 0; for (i=0; inreaders; i++) { - if ( !ma->has_line[i]) continue; + bcf1_t *line = maux_get_line(args, i); + if ( !line ) continue; bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; - bcf_unpack(line, BCF_UN_ALL); int k; for (k=0; kd.n_flt; k++) @@ -791,8 +943,8 @@ void merge_filter(args_t *args, bcf1_t *out) { int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt); if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt); - hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt); - ma->flt[out->d.n_flt] = id; + hts_expand(int,out->d.n_flt+1,out->d.m_flt,out->d.flt); + out->d.flt[out->d.n_flt] = id; out->d.n_flt++; kh_put(strdict, tmph, flt, &ret); } @@ -803,20 +955,17 @@ void merge_filter(args_t *args, bcf1_t *out) { int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS"); for (i=0; id.n_flt; i++) - if ( ma->flt[i]==id ) break; + if ( out->d.flt[i]==id ) break; if ( id.n_flt ) { out->d.n_flt--; - for (; id.n_flt; i++) ma->flt[i] = ma->flt[i+1]; + for (; id.n_flt; i++) out->d.flt[i] = out->d.flt[i+1]; } } - out->d.flt = ma->flt; } static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str) { - assert( !info->vptr_free ); - uint8_t *ptr = info->vptr - info->vptr_off; bcf_dec_typed_int1(ptr, &ptr); @@ -835,8 +984,6 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str); info->vptr = (uint8_t*) tmp_str->s + info->vptr_off; - info->vptr_free = 1; - line->d.shared_dirty |= BCF1_DIRTY_INF; tmp_str->s = NULL; tmp_str->m = 0; tmp_str->l = 0; @@ -1031,9 +1178,10 @@ void merge_info(args_t *args, bcf1_t *out) info_rules_reset(args); for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + bcf1_t *line = maux_get_line(args,i); + if ( !line ) continue; + int irec = ma->buf[i].cur; bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; for (j=0; jn_info; j++) { @@ -1052,7 +1200,7 @@ void merge_info(args_t *args, bcf1_t *out) info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key); if ( rule ) { - maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL; + maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->buf[i].rec[irec] : NULL; if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue; } } @@ -1063,7 +1211,7 @@ void merge_info(args_t *args, bcf1_t *out) { if ( kitr == kh_end(tmph) ) { - // first occurance in this reader, alloc arrays + // seeing this key for the first time ma->nAGR_info++; hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info); kitr = kh_put(strdict, tmph, key, &ret); @@ -1081,37 +1229,36 @@ void merge_info(args_t *args, bcf1_t *out) kitr = kh_get(strdict, tmph, key); int idx = kh_val(tmph, kitr); if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1); - merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]); + merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]); continue; } if ( kitr == kh_end(tmph) ) { - hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf); - ma->inf[out->n_info].key = id; - ma->inf[out->n_info].type = inf->type; - ma->inf[out->n_info].len = inf->len; - ma->inf[out->n_info].vptr = inf->vptr; - ma->inf[out->n_info].v1.i = inf->v1.i; - ma->inf[out->n_info].v1.f = inf->v1.f; - ma->inf[out->n_info].vptr_off = inf->vptr_off; - ma->inf[out->n_info].vptr_len = inf->vptr_len; - ma->inf[out->n_info].vptr_free = inf->vptr_free; + // Seeing this key for the first time. Although quite hacky, + // this is faster than anything else given the data structures.. + + hts_expand0(bcf_info_t,out->n_info+1,out->d.m_info,out->d.info); + out->d.info[out->n_info].key = id; + out->d.info[out->n_info].type = inf->type; + out->d.info[out->n_info].len = inf->len; + out->d.info[out->n_info].v1.i = inf->v1.i; + out->d.info[out->n_info].v1.f = inf->v1.f; + out->d.info[out->n_info].vptr_off = inf->vptr_off; + out->d.info[out->n_info].vptr_len = inf->vptr_len; + out->d.info[out->n_info].vptr_free = 1; + out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); + memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off); + out->d.info[out->n_info].vptr += inf->vptr_off; if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) ) - { - // The existing packed info cannot be reused. Change the id. - // Although quite hacky, it's faster than anything else given - // the data structures - bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps); - } + bcf_info_set_id(out, &out->d.info[out->n_info], id, &args->tmps); + out->d.shared_dirty |= BCF1_DIRTY_INF; out->n_info++; kitr = kh_put(strdict, tmph, key, &ret); kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value } } } - out->d.info = ma->inf; - out->d.m_info = ma->minf; for (i=0; inrules; i++) args->rules[i].merger(args->out_hdr, out, &args->rules[i]); for (i=0; inAGR_info; i++) @@ -1156,12 +1303,14 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); + int default_gt = args->missing_to_ref ? bcf_gt_unphased(0) : bcf_gt_missing; for (i=0; inreaders; i++) { bcf_sr_t *reader = &files->readers[i]; bcf_hdr_t *hdr = reader->header; bcf_fmt_t *fmt_ori = fmt_map[i]; int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize; + int irec = ma->buf[i].cur; int j, k; if ( !fmt_ori ) @@ -1169,7 +1318,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) // missing values: assume maximum ploidy for (j=0; jsmpl_ploidy[ismpl+j]++; } + for (k=0; ksmpl_ploidy[ismpl+j]++; } tmp += nsize; } ismpl += bcf_hdr_nsamples(hdr); @@ -1178,7 +1327,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) #define BRANCH(type_t, vector_end) { \ type_t *p_ori = (type_t*) fmt_ori->p; \ - if ( !ma->d[i][0].als_differ ) \ + if ( !ma->buf[i].rec[irec].als_differ ) \ { \ /* the allele numbering is unchanged */ \ for (j=0; j>1) - 1; \ - al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \ + al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \ tmp[k] = (al << 1) | ((p_ori[k])&1); \ } \ } \ @@ -1241,7 +1390,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int nsize = 0, length = BCF_VL_FIXED, type = -1; for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + if ( !maux_get_line(args,i) ) continue; if ( !fmt_map[i] ) continue; if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key; type = fmt_map[i]->type; @@ -1279,10 +1428,12 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) bcf_sr_t *reader = &files->readers[i]; bcf_hdr_t *hdr = reader->header; bcf_fmt_t *fmt_ori = fmt_map[i]; + bcf1_t *line = maux_get_line(args, i); + int irec = ma->buf[i].cur; if ( fmt_ori ) { type = fmt_ori->type; - int nals_ori = reader->buffer[0]->n_allele; + int nals_ori = line->n_allele; if ( length==BCF_VL_G ) { // if all fields are missing then n==1 is valid @@ -1315,10 +1466,8 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) ismpl += bcf_hdr_nsamples(hdr); \ continue; \ } \ - assert( ma->has_line[i] ); \ - bcf1_t *line = reader->buffer[0]; \ src_type_t *src = (src_type_t*) fmt_ori->p; \ - if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \ + if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \ { \ /* alleles unchanged, copy over */ \ for (j=0; jn_allele; iori++) \ { \ - inew = ma->d[i][0].map[iori]; \ + inew = ma->buf[i].rec[irec].map[iori]; \ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \ if ( src_is_vector_end ) break; \ @@ -1374,10 +1523,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int iori,jori, inew,jnew; \ for (iori=0; iorin_allele; iori++) \ { \ - inew = ma->d[i][0].map[iori]; \ + inew = ma->buf[i].rec[irec].map[iori]; \ for (jori=0; jori<=iori; jori++) \ { \ - jnew = ma->d[i][0].map[jori]; \ + jnew = ma->buf[i].rec[irec].map[jori]; \ int kori = iori*(iori+1)/2 + jori; \ int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \ @@ -1414,7 +1563,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int iori,inew; \ for (iori=ifrom; iorin_allele; iori++) \ { \ - inew = ma->d[i][0].map[iori] - ifrom; \ + inew = ma->buf[i].rec[irec].map[iori] - ifrom; \ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \ if ( src_is_vector_end ) break; \ if ( src_is_missing ) tgt_set_missing; \ @@ -1463,9 +1612,9 @@ void merge_format(args_t *args, bcf1_t *out) int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index for (i=0; inreaders; i++) { - if ( !ma->has_line[i] ) continue; + bcf1_t *line = maux_get_line(args,i); + if ( !line ) continue; bcf_sr_t *reader = &files->readers[i]; - bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; for (j=0; jn_fmt; j++) { @@ -1497,9 +1646,10 @@ void merge_format(args_t *args, bcf1_t *out) ma->fmt_map[ifmt*files->nreaders+i] = fmt; } // Check if the allele numbering must be changed - for (j=1; jbuffer[0]->n_allele; j++) - if ( ma->d[i][0].map[j]!=j ) break; - ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1; + int irec = ma->buf[i].cur; + for (j=1; jn_allele; j++) + if ( ma->buf[i].rec[irec].map[j]!=j ) break; + ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1; } out->n_sample = bcf_hdr_nsamples(out_hdr); @@ -1507,203 +1657,383 @@ void merge_format(args_t *args, bcf1_t *out) merge_GT(args, ma->fmt_map, out); update_AN_AC(out_hdr, out); - if ( out->d.info!=ma->inf ) - { - // hacky, we rely on htslib internals: bcf_update_info() reallocated the info - ma->inf = out->d.info; - ma->minf = out->d.m_info; - } - for (i=1; i<=max_ifmt; i++) merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); out->d.indiv_dirty = 1; } -// The core merging function, one or none line from each reader -void merge_line(args_t *args) +void gvcf_set_alleles(args_t *args) +{ + int i,k; + bcf_srs_t *files = args->files; + maux_t *maux = args->maux; + gvcf_aux_t *gaux = maux->gvcf; + maux->nals = 0; + + for (i=0; inreaders; i++) + { + if ( !gaux[i].active ) continue; + bcf1_t *line = maux_get_line(args, i); + int irec = maux->buf[i].cur; + + hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map); + if ( !maux->nals ) // first record, copy the alleles to the output + { + maux->nals = line->n_allele; + hts_expand0(char*, maux->nals, maux->mals, maux->als); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + for (k=0; knals; k++) + { + if ( maux->als[k] ) free(maux->als[k]); + maux->als[k] = strdup(line->d.allele[k]); + maux->buf[i].rec[irec].map[k] = k; + } + } + else + { + maux->als = merge_alleles(line->d.allele, line->n_allele, maux->buf[i].rec[irec].map, maux->als, &maux->nals, &maux->mals); + if ( !maux->als ) + { + bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); + error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1); + } + } + } +} + +/* + Output staged gVCF blocks, end is the last position of the block. Assuming + gaux[i].active flags are set and maux_get_line returns correct lines. +*/ +void gvcf_write_block(args_t *args, int start, int end) { + int i; + maux_t *maux = args->maux; + gvcf_aux_t *gaux = maux->gvcf; + assert(gaux); + + // Update POS + int min = INT_MAX; + char ref = 'N'; + for (i=0; ifiles->nreaders; i++) + { + if ( !gaux[i].active ) continue; + if ( ref=='N' && gaux[i].line->pos==start ) ref = gaux[i].line->d.allele[0][0]; + gaux[i].line->pos = start; + } + for (i=0; ifiles->nreaders; i++) + { + if ( !gaux[i].active ) continue; + if ( gaux[i].end < start ) + { + gaux[i].active = 0; + maux->buf[i].cur = -1; + continue; + } + gaux[i].line->d.allele[0][0] = ref; + if ( min > gaux[i].end ) min = gaux[i].end; + } + // Check for valid gVCF blocks in this region + if ( min==INT_MAX ) + { + assert(0); + maux->gvcf_min = 0; + return; + } + bcf1_t *out = args->out_line; - bcf_clear1(out); - out->unpacked = BCF_UN_ALL; + gvcf_set_alleles(args); + + // Merge the staged lines merge_chrom2qual(args, out); merge_filter(args, out); merge_info(args, out); merge_format(args, out); - bcf_write1(args->out_fh, args->out_hdr, out); -} + if ( args->gvcf_fai && out->d.allele[0][0]=='N' ) + { + int slen = 0; + char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen); + if (slen) + { + out->d.allele[0][0] = seq[0]; + free(seq); + } + } + // Update END boundary + if ( end > start ) + { + end++; + bcf_update_info_int32(args->out_hdr, out, "END", &end, 1); + } + else + bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); + bcf_write1(args->out_fh, args->out_hdr, out); + bcf_clear1(out); -void debug_buffers(FILE *fp, bcf_srs_t *files); -void debug_buffer(FILE *fp, bcf_sr_t *reader); -#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; } + // Inactivate blocks which do not extend beyond END and find new gvcf_min + min = INT_MAX; + for (i=0; ifiles->nreaders; i++) + { + if ( !gaux[i].active ) continue; + if ( gaux[i].end < end ) + { + gaux[i].active = 0; + maux->buf[i].cur = -1; + continue; + } + // next min END position bigger than the current one + if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1; + } + maux->gvcf_min = min==INT_MAX ? 0 : min; +} -// Clean the reader's buffer to and make it ready for the next next_line() call. -// Moves finished records (SKIP_DONE flag set) at the end of the buffer and put -// the rest to the beggining. Then shorten the buffer so that the last element -// points to the last unfinished record. There are two special cases: the last -// line of the buffer typically has a different position and must stay at the -// end; next, the first record of the buffer must be one of those already -// printed, as it will be discarded by next_line(). -// -void shake_buffer(maux_t *maux, int ir, int pos) +/* + Flush staged gVCF blocks. Flush everything if there are no more lines + (done=1) or if there is a new chromosome. If still on the same chromosome, + all hanging blocks must be ended by creating new records: + A + 1 END=10 + B + 3 END=7 + C + 3 END=5 + out + 1 END=2 A . . + 3 END=5 A B C + 6 END=7 A B . + 8 END=10 A . . + +*/ +void gvcf_flush(args_t *args, int done) { - bcf_sr_t *reader = &maux->files->readers[ir]; - maux1_t *m = maux->d[ir]; - - if ( !reader->buffer ) return; - int i; - // FILE *fp = pysam_stdout; - // fprintf(fp," nbuf=%d\t", reader->nbuffer); for (i=0; inbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n"); - // debug_buffer(fp,reader); - // fprintf(fp,"--\n"); + maux_t *maux = args->maux; - int a = 1, b = reader->nbuffer; - if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards + if ( !maux->chr ) return; // first time here, nothing to flush - while ( abuffer[a], reader->buffer[b]); - SWAP(maux1_t, m[a], m[b]); - a++; - b--; - } + // Get current position and chromosome + for (i=0; in; i++) + if ( bcf_sr_has_line(maux->files,i) ) break; + bcf1_t *line = bcf_sr_get_line(maux->files,i); + bcf_hdr_t *hdr = bcf_sr_get_header(maux->files,i); - // position $a to the after the first unfinished record - while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++; + if ( !strcmp(maux->chr,bcf_seqname(hdr,line)) ) flush_until = line->pos; // still on the same chr + } - if ( anbuffer ) + // When called on a region, trim the blocks accordingly + int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; + if ( args->regs ) { - // there is a gap between the unfinished lines at the beggining and the - // last line. The last line must be brought forward to fill the gap - if ( reader->buffer[reader->nbuffer]->pos != pos ) + int rstart = -1, rend = -1; + if ( regidx_overlap(args->regs,maux->chr,start,flush_until,args->regs_itr) ) { - SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]); - SWAP(maux1_t, m[a], m[reader->nbuffer]); - reader->nbuffer = a; + // In case there are multiple regions, we treat them as one + rstart = args->regs_itr->beg; + while ( regitr_overlap(args->regs_itr) ) rend = args->regs_itr->end; } + if ( rstart > start ) start = rstart; + if ( rend < flush_until ) flush_until = rend+1; } - if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos ) + // output all finished blocks + while ( maux->gvcf_min && start < flush_until ) { - // the first record is unfinished, replace it with an empty line - // from the end of the buffer or else next_line will remove it - if ( reader->nbuffer + 1 >= maux->nbuf[ir] ) + // does the block end before the new line or is it interrupted? + int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until; + if ( start > tmp-1 ) break; + gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based + start = tmp; + } +} + +/* + Check incoming lines for new gVCF blocks, set pointer to the current source + buffer (gvcf or readers). In contrast to gvcf_flush, this function can be + called only after maux_reset as it relies on updated maux buffers. +*/ +void gvcf_stage(args_t *args, int pos) +{ + maux_t *maux = args->maux; + gvcf_aux_t *gaux = maux->gvcf; + bcf_srs_t *files = args->files; + int32_t *end = (int32_t*) maux->tmp_arr; + int i, nend = maux->ntmp_arr / sizeof(int32_t); + + maux->gvcf_break = -1; + maux->gvcf_min = INT_MAX; + for (i=0; inreaders; i++) + { + if ( gaux[i].active ) { - reader->nbuffer++; - maux_expand1(maux, ir); - reader->nbuffer--; - m = maux->d[ir]; + // gvcf block should not overlap with another record + if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1; + maux->buf[i].beg = 0; + maux->buf[i].end = 1; + maux->buf[i].cur = 0; + continue; } - if ( reader->nbuffer+1 >= reader->mbuffer ) - error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer); - if ( reader->buffer[reader->nbuffer]->pos!=pos ) + // Does any of the lines have END set? It is enough to check only the + // first line, there should be no duplicate records with END in gVCF + + if ( maux->buf[i].beg==maux->buf[i].end ) continue; // no new record + + int irec = maux->buf[i].beg; + bcf_hdr_t *hdr = bcf_sr_get_header(files, i); + bcf1_t *line = args->files->readers[i].buffer[irec]; + int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); + if ( ret==1 ) { - // 4way swap - bcf1_t *tmp = reader->buffer[0]; - reader->buffer[0] = reader->buffer[reader->nbuffer+1]; - reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer]; - reader->buffer[reader->nbuffer] = tmp; - m[reader->nbuffer].skip = m[0].skip; - m[reader->nbuffer+1].skip = SKIP_DIFF; - reader->nbuffer++; + // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with + // an empty record: the gaux line must be kept until we reach its END. + gaux[i].active = 1; + gaux[i].end = end[0] - 1; + SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line); + gaux[i].line->pos = pos; + + maux->buf[i].lines = &gaux[i].line; + maux->buf[i].beg = 0; + maux->buf[i].end = 1; + maux->buf[i].cur = 0; + + // Set the rid,pos of the swapped line in the buffer or else the + // synced reader will have a problem with the next line + // + args->files->readers[i].buffer[irec]->rid = maux->buf[i].rid; + args->files->readers[i].buffer[irec]->pos = maux->pos; + + // Update block offsets + if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1; } else - { - SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]); - SWAP(maux1_t, m[0], m[reader->nbuffer+1]); - } + maux->gvcf_break = line->pos; // must break the gvcf block } + maux->ntmp_arr = nend * sizeof(int32_t); + maux->tmp_arr = end; + if ( maux->gvcf_min==INT_MAX ) maux->gvcf_min = 0; +} + + +void debug_buffers(FILE *fp, bcf_srs_t *files); +void debug_buffer(FILE *fp, bcf_srs_t *files, int reader); + +/* + Flush all buffered and processed records with the same coordinate. + Note that synced reader discards buffer[0], so that needs to stay + untouched. +*/ +void clean_buffer(args_t *args) +{ + maux_t *ma = args->maux; + + int ir; + for (ir=0; irn; ir++) + { + // Invalidate pointer to reader's buffer or else gvcf_flush will attempt + // to use the old lines via maux_get_line() + if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; - // debug_buffer(fp,reader); - // fprintf(fp,"\t"); for (i=0; inbuffer; i++) fprintf(fp," %d", skip[i]); - // fprintf(fp,"\n\n"); + bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir); + if ( !reader->nbuffer ) continue; // nothing to clean - // set position of finished buffer[0] line to -1, otherwise swapping may - // bring it back after next_line() - reader->buffer[0]->pos = -1; + bcf1_t **buf = reader->buffer; + if ( buf[1]->rid!=ma->buf[ir].rid || buf[1]->pos!=ma->pos ) continue; // nothing to flush - // trim the buffer, remove finished lines from the end - i = reader->nbuffer; - while ( i>=1 && m[i--].skip&SKIP_DONE ) - reader->nbuffer--; + int a = 1, b = 2; + while ( b<=reader->nbuffer && buf[b]->rid==ma->buf[ir].rid && buf[b]->pos==ma->pos ) b++; + // b now points to the first line we want to preserve + while ( b<=reader->nbuffer ) + { + SWAP(bcf1_t*, buf[a], buf[b]); + a++; b++; + } + reader->nbuffer -= b-a; + } } -void debug_maux(args_t *args, int pos, int var_type) +void debug_maux(args_t *args) { bcf_srs_t *files = args->files; maux_t *maux = args->maux; int j,k,l; - fprintf(pysam_stderr,"Alleles to merge at %d\n", pos+1); + fprintf(pysam_stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals); for (j=0; jnreaders; j++) { bcf_sr_t *reader = &files->readers[j]; + buffer_t *buf = &maux->buf[j]; fprintf(pysam_stderr," reader %d: ", j); - for (k=0; k<=reader->nbuffer; k++) + for (k=buf->beg; kend; k++) { - if ( maux->d[j][k].skip==SKIP_DONE ) continue; + if ( buf->rec[k].skip & SKIP_DONE ) continue; bcf1_t *line = reader->buffer[k]; - if ( line->pos!=pos ) continue; fprintf(pysam_stderr,"\t"); - if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round + if ( buf->rec[k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round for (l=0; ln_allele; l++) fprintf(pysam_stderr,"%s%s", l==0?"":",", line->d.allele[l]); - if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"]"); + if ( buf->rec[k].skip ) fprintf(pysam_stderr,"]"); } fprintf(pysam_stderr,"\n"); } fprintf(pysam_stderr," counts: "); - for (j=0; jnals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysam_stderr,"\n"); - for (j=0; jnreaders; j++) - { - bcf_sr_t *reader = &files->readers[j]; - fprintf(pysam_stderr," out %d: ", j); - for (k=0; k<=reader->nbuffer; k++) - { - if ( maux->d[j][k].skip==SKIP_DONE ) continue; - bcf1_t *line = reader->buffer[k]; - if ( line->pos!=pos ) continue; - if ( maux->d[j][k].skip ) continue; - fprintf(pysam_stderr,"\t"); - for (l=0; ln_allele; l++) - fprintf(pysam_stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]); - } - fprintf(pysam_stderr,"\n"); - } - fprintf(pysam_stderr,"\n"); + for (j=0; jnals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); + fprintf(pysam_stderr,"\n\n"); } -// Determine which line should be merged from which reader: go through all -// readers and all buffered lines, expand REF,ALT and try to match lines with -// the same ALTs. A step towards output independent on input ordering of the -// lines. -void merge_buffer(args_t *args) + +/* + Determine which line should be merged from which reader: go through all + readers and all buffered lines, expand REF,ALT and try to match lines with + the same ALTs. + */ +int can_merge(args_t *args) { bcf_srs_t *files = args->files; - int i, pos = -1, var_type = 0; - char *id = NULL; + int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; maux_t *maux = args->maux; - maux_reset(maux); + gvcf_aux_t *gaux = maux->gvcf; + char *id = NULL, ref = 'N'; + maux->var_types = maux->nals = 0; - // set the current position + int i,j,k, ntodo = 0; for (i=0; inreaders; i++) { - if ( bcf_sr_has_line(files,i) ) + buffer_t *buf = &maux->buf[i]; + + if ( gaux && gaux[i].active ) { - bcf1_t *line = bcf_sr_get_line(files,i); - pos = line->pos; - var_type = bcf_get_variant_types(line); - id = line->d.id; - break; + // skip readers with active gvcf blocks + buf->rec[buf->beg].skip = SKIP_DIFF; + continue; + } + for (j=buf->beg; jend; j++) + { + if ( buf->rec[j].skip & SKIP_DONE ) continue; + + buf->rec[j].skip = SKIP_DIFF; + ntodo++; + + if ( args->merge_by_id ) + id = buf->lines[j]->d.id; + else + { + int var_type = bcf_get_variant_types(buf->lines[j]); + maux->var_types |= var_type ? var_type<<1 : 1; + } } + + // for gvcf: find out REF at this position + if ( buf->beg < buf->end && ref=='N' ) + ref = buf->lines[buf->beg]->d.allele[0][0]; } + if ( !ntodo ) return 0; // In this loop we select from each reader compatible candidate lines. // (i.e. SNPs or indels). Go through all files and all lines at this @@ -1712,19 +2042,24 @@ void merge_buffer(args_t *args) for (i=0; inreaders; i++) { bcf_sr_t *reader = &files->readers[i]; - if ( !reader->buffer ) continue; - int j, k; - for (j=0; j<=reader->nbuffer; j++) + buffer_t *buf = &maux->buf[i]; + + if ( gaux && gaux[i].active ) { - bcf1_t *line = reader->buffer[j]; + gaux[i].line->d.allele[0][0] = ref; + gaux[i].line->pos = maux->pos; + } + + for (j=buf->beg; jend; j++) + { + if ( buf->rec[j].skip & SKIP_DONE ) continue; + + bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer + int line_type = bcf_get_variant_types(line); + line_type = line_type ? line_type<<1 : 1; + // select relevant lines - maux->d[i][j].skip = SKIP_DIFF; - if ( pos!=line->pos ) - { - if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore - continue; - } if ( args->merge_by_id ) { if ( strcmp(id,line->d.id) ) continue; @@ -1735,30 +2070,30 @@ void merge_buffer(args_t *args) { // All alleles of the tested record must be present in the // selected maux record plus variant types must be the same - if ( var_type!=line->d.var_type ) continue; + if ( (maux->var_types & line_type) != line_type ) continue; if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible for (k=1; kn_allele; k++) { if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break; } - if ( k==line->n_allele ) continue; // no matching allele + if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele } if ( !(args->collapse&COLLAPSE_ANY) ) { - int compatible = 0; - if ( line_type==var_type ) compatible = 1; - else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything - else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1; - else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1; - else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1; - else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1; - else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1; - if ( !compatible ) continue; + // Merge: + // - SNPs+SNPs+MNPs+REF if -m both,snps + // - indels+indels+REF if -m both,indels, REF only if SNPs are not present + // - SNPs come first + if ( line_type & indel_mask ) + { + if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first + if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks + } } } - maux->d[i][j].skip = 0; + buf->rec[j].skip = 0; - hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map); + hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map); if ( !maux->nals ) // first record, copy the alleles to the output { maux->nals = line->n_allele; @@ -1766,111 +2101,118 @@ void merge_buffer(args_t *args) hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); for (k=0; knals; k++) { + free(maux->als[k]); maux->als[k] = strdup(line->d.allele[k]); - maux->d[i][j].map[k] = k; + buf->rec[j].map[k] = k; maux->cnt[k] = 1; } - pos = line->pos; continue; } - // normalize alleles - maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals); - if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname); + maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); + if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(args->out_hdr,line),line->pos+1,reader->fname); hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); for (k=1; kn_allele; k++) - maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files + maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files maux->cnt[0]++; } } + return 1; +} - // debug_maux(args, pos, var_type); +/* + Select records that have the same alleles; the input ordering of indels + must not matter. Multiple VCF lines can be emitted from this loop. + We expect only very few alleles and not many records with the same + position in the buffers, therefore the nested loops should not slow us + much. +*/ +void stage_line(args_t *args) +{ + int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; + bcf_srs_t *files = args->files; + maux_t *maux = args->maux; - // Select records that have the same alleles; the input ordering of indels - // must not matter. Multiple VCF lines can be emitted from this loop. - // We expect only very few alleles and not many records with the same - // position in the buffers, therefore the nested loops should not slow us - // much. - while (1) + // debug_maux(args); + + // take the most frequent allele present in multiple files, REF is skipped + int i,j,k,icnt = 1; + for (i=2; inals; i++) + if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i; + + int nout = 0; + for (i=0; inreaders; i++) { - // take the most frequent allele present in multiple files - int icnt = 0; - for (i=1; inals; i++) - if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i; - if ( maux->cnt[icnt]<0 ) break; + buffer_t *buf = &maux->buf[i]; + buf->cur = -1; + if ( buf->beg >= buf->end ) continue; // no lines in the buffer - int nmask = 0; - for (i=0; inreaders; i++) + // find lines with the same allele + for (j=buf->beg; jend; j++) { - maux->has_line[i] = 0; + if ( buf->rec[j].skip ) continue; // done or not compatible + if ( args->merge_by_id ) break; + if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record - bcf_sr_t *reader = &files->readers[i]; - if ( !reader->buffer ) continue; + for (k=0; klines[j]->n_allele; k++) + if ( icnt==buf->rec[j].map[k] ) break; - // find lines with the same allele - int j; - for (j=0; j<=reader->nbuffer; j++) - { - if ( maux->d[i][j].skip ) continue; - int k; - for (k=0; kbuffer[j]->n_allele; k++) - if ( icnt==maux->d[i][j].map[k] ) break; - if ( kbuffer[j]->n_allele ) break; - } - if ( j>reader->nbuffer ) - { - // no matching allele found in this file - if ( args->collapse==COLLAPSE_NONE ) continue; + if ( klines[j]->n_allele ) break; + } + if ( j>=buf->end ) + { + // no matching allele found in this file + if ( args->collapse==COLLAPSE_NONE ) continue; - for (j=0; j<=reader->nbuffer; j++) + for (j=buf->beg; jend; j++) + { + if ( buf->rec[j].skip ) continue; // done or not compatible + if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged + int line_type = bcf_get_variant_types(buf->lines[j]); + if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; + if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; + if ( line_type==VCF_REF ) { - if ( maux->d[i][j].skip ) continue; - if ( args->collapse&COLLAPSE_ANY ) break; - int line_type = bcf_get_variant_types(reader->buffer[j]); - if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - if ( line_type==VCF_REF ) - { - if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - } - else if ( var_type==VCF_REF ) - { - if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - } + if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; + if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ref_mask ) break; } - } - if ( j<=reader->nbuffer ) - { - // found a suitable line for merging, place it at the beggining - if ( j>0 ) + else if ( maux->var_types&ref_mask ) { - SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]); - SWAP(maux1_t, maux->d[i][0], maux->d[i][j]); + if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; + if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; } - // mark as finished so that it's ignored next time - maux->d[i][0].skip |= SKIP_DONE; - maux->has_line[i] = 1; - nmask++; } } - if ( !nmask ) break; // done, no more lines suitable for merging found - merge_line(args); // merge and output the line - maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished + if ( jend ) + { + // found a suitable line for merging + buf->cur = j; + + // mark as finished so that it's ignored next time + buf->rec[j].skip = SKIP_DONE; + nout++; + } } + assert( nout ); +} - // clean the alleles - for (i=0; inals; i++) +void merge_line(args_t *args) +{ + if ( args->regs ) { - free(maux->als[i]); - maux->als[i] = 0; + if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return; } - maux->nals = 0; - // get the buffers ready for the next next_line() call - for (i=0; inreaders; i++) - shake_buffer(maux, i, pos); + bcf1_t *out = args->out_line; + merge_chrom2qual(args, out); + merge_filter(args, out); + merge_info(args, out); + if ( args->do_gvcf ) + bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); + merge_format(args, out); + bcf_write1(args->out_fh, args->out_hdr, out); + bcf_clear1(out); } void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd) @@ -1889,6 +2231,8 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c else ksprintf(&str, " %s", argv[i]); } + kputs("; Date=", &str); + time_t tm; time(&tm); kputs(ctime(&tm), &str); kputc('\n', &str); bcf_hdr_append(hdr,str.s); free(str.s); @@ -1900,7 +2244,7 @@ void merge_vcf(args_t *args) { args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); + if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); if ( args->header_fname ) @@ -1930,14 +2274,32 @@ void merge_vcf(args_t *args) } if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init(); - args->maux = maux_init(args->files); + args->maux = maux_init(args); args->out_line = bcf_init1(); args->tmph = kh_init(strdict); - int ret; - while ( (ret=bcf_sr_next_line(args->files)) ) + + while ( bcf_sr_next_line(args->files) ) { - merge_buffer(args); + // output cached gVCF blocks which end before the new record + if ( args->do_gvcf ) + gvcf_flush(args,0); + + maux_reset(args->maux); + + // determine which of the new records are gvcf blocks + if ( args->do_gvcf ) + gvcf_stage(args, args->maux->pos); + + while ( can_merge(args) ) + { + stage_line(args); + merge_line(args); + } + clean_buffer(args); } + if ( args->do_gvcf ) + gvcf_flush(args,1); + info_rules_destroy(args); maux_destroy(args->maux); bcf_hdr_destroy(args->out_hdr); @@ -1960,7 +2322,10 @@ static void usage(void) fprintf(pysam_stderr, " --force-samples resolve duplicate sample names\n"); fprintf(pysam_stderr, " --print-header print only the merged header and exit\n"); fprintf(pysam_stderr, " --use-header use the provided header\n"); + fprintf(pysam_stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n"); fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(pysam_stderr, " -F, --filter-logic remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); + fprintf(pysam_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); fprintf(pysam_stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(pysam_stderr, " -l, --file-list read file names from the file\n"); fprintf(pysam_stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); @@ -1991,7 +2356,9 @@ int main_vcfmerge(int argc, char *argv[]) { {"help",no_argument,NULL,'h'}, {"merge",required_argument,NULL,'m'}, + {"gvcf",required_argument,NULL,'g'}, {"file-list",required_argument,NULL,'l'}, + {"missing-to-ref",no_argument,NULL,'0'}, {"apply-filters",required_argument,NULL,'f'}, {"use-header",required_argument,NULL,1}, {"print-header",no_argument,NULL,2}, @@ -2003,10 +2370,25 @@ int main_vcfmerge(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"info-rules",required_argument,NULL,'i'}, {"no-version",no_argument,NULL,8}, + {"filter-logic",required_argument,NULL,'F'}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) { switch (c) { + case 'F': + if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD; + else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE; + else error("Filter logic not recognised: %s\n", optarg); + break; + case '0': args->missing_to_ref = 1; break; + case 'g': + args->do_gvcf = 1; + if ( strcmp("-",optarg) ) + { + args->gvcf_fai = fai_load(optarg); + if ( !args->gvcf_fai ) error("Failed to load the fai index: %s\n", optarg); + } + break; case 'l': args->file_list = optarg; break; case 'i': args->info_rules = optarg; break; case 'o': args->output_fname = optarg; break; @@ -2047,9 +2429,23 @@ int main_vcfmerge(int argc, char *argv[]) if ( argc-optind<2 && !args->file_list ) usage(); args->files->require_index = 1; - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( args->regions_list ) + { + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + if ( regions_is_file ) + args->regs = regidx_init(args->regions_list,NULL,NULL,sizeof(char*),NULL); + else + { + args->regs = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); + if ( regidx_insert_list(args->regs,args->regions_list,',') !=0 ) error("Could not parse the regions: %s\n", args->regions_list); + regidx_insert(args->regs,NULL); + } + if ( !args->regs ) error("Could not parse the regions: %s\n", args->regions_list); + args->regs_itr = regitr_init(args->regs); + } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); while (optindfiles, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); @@ -2067,6 +2463,9 @@ int main_vcfmerge(int argc, char *argv[]) } merge_vcf(args); bcf_sr_destroy(args->files); + if ( args->regs ) regidx_destroy(args->regs); + if ( args->regs_itr ) regitr_destroy(args->regs_itr); + if ( args->gvcf_fai ) fai_destroy(args->gvcf_fai); free(args); return 0; } diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index 781833c..3a1706b 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -87,10 +88,21 @@ static inline int replace_iupac_codes(char *seq, int nseq) for (i=0; ihdr,line,gts,ngts); } -#define ERR_DUP_ALLELE -2 -#define ERR_REF_MISMATCH -1 -#define ERR_OK 0 -#define ERR_SYMBOLIC 1 +#define ERR_DUP_ALLELE -2 +#define ERR_REF_MISMATCH -1 +#define ERR_OK 0 +#define ERR_SYMBOLIC 1 +#define ERR_SPANNING_DELETION 2 static int realign(args_t *args, bcf1_t *line) { @@ -261,13 +274,17 @@ static int realign(args_t *args, bcf1_t *line) int i, nref, reflen = strlen(line->d.allele[0]); char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); - replace_iupac_codes(ref,nref); + replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N - // does REF contain non-standard bases? - if ( replace_iupac_codes(line->d.allele[0],reflen) ) + // does VCF REF contain non-standard bases? + if ( has_non_acgtn(line->d.allele[0],reflen) ) { - args->nchanged++; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + if ( args->check_ref==CHECK_REF_EXIT ) + error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); + if ( args->check_ref & CHECK_REF_WARN ) + fprintf(stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); + free(ref); + return ERR_REF_MISMATCH; } if ( strcasecmp(ref,line->d.allele[0]) ) { @@ -289,6 +306,16 @@ static int realign(args_t *args, bcf1_t *line) for (i=0; in_allele; i++) { if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele + if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion + if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC; // breakend, not an error + if ( has_non_acgtn(line->d.allele[i],0) ) + { + if ( args->check_ref==CHECK_REF_EXIT ) + error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); + if ( args->check_ref & CHECK_REF_WARN ) + fprintf(stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); + return ERR_REF_MISMATCH; + } als[i].l = 0; kputs(line->d.allele[i], &als[i]); @@ -390,18 +417,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \ if ( len==BCF_VL_A ) \ { \ - assert( ret==src->n_allele-1); \ + if ( ret!=src->n_allele-1 ) \ + error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \ bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ } \ else if ( len==BCF_VL_R ) \ { \ - assert( ret==src->n_allele); \ + if ( ret!=src->n_allele ) \ + error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ } \ else if ( len==BCF_VL_G ) \ { \ - assert( ret==src->n_allele*(src->n_allele+1)/2 ); \ + if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \ + error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ if ( ialt!=0 ) \ { \ vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ @@ -545,7 +578,9 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( len==BCF_VL_A ) \ { \ - assert( nvals==(src->n_allele-1)*nsmpl); \ + if ( nvals!=(src->n_allele-1)*nsmpl ) \ + error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ nvals /= nsmpl; \ type_t *src_vals = vals, *dst_vals = vals; \ for (i=0; in_allele*nsmpl); \ + if ( nvals!=src->n_allele*nsmpl ) \ + error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \ nvals /= nsmpl; \ type_t *src_vals = vals, *dst_vals = vals; \ for (i=0; in_allele*(src->n_allele+1)/2 || nfields==src->n_allele ); + if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele ) + error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n", + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); + int len = 0; if ( nfields==src->n_allele ) // haploid { @@ -994,7 +1034,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ else { int ial = bcf_gt_allele(gt2[k]); - assert( ialmaps[i].nals ); + if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial); gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); } } @@ -1583,7 +1623,8 @@ static void normalize_vcf(args_t *args) { htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out, args->n_threads); + if ( args->n_threads ) + hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); bcf_hdr_write(out, args->hdr); @@ -1666,7 +1707,7 @@ static void usage(void) fprintf(stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); - fprintf(stderr, " -f, --fasta-ref reference sequence\n"); + fprintf(stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); @@ -1677,7 +1718,7 @@ static void usage(void) fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(stderr, " --threads number of extra (de)compression threads [0]\n"); fprintf(stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, "\n"); exit(1); @@ -1804,6 +1845,7 @@ int main_vcfnorm(int argc, char *argv[]) error("Failed to read the targets: %s\n", args->targets); } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n"); init_data(args); diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index 200ce79..da5a2aa 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -89,10 +90,21 @@ static inline int replace_iupac_codes(char *seq, int nseq) for (i=0; ihdr,line,gts,ngts); } -#define ERR_DUP_ALLELE -2 -#define ERR_REF_MISMATCH -1 -#define ERR_OK 0 -#define ERR_SYMBOLIC 1 +#define ERR_DUP_ALLELE -2 +#define ERR_REF_MISMATCH -1 +#define ERR_OK 0 +#define ERR_SYMBOLIC 1 +#define ERR_SPANNING_DELETION 2 static int realign(args_t *args, bcf1_t *line) { @@ -263,13 +276,17 @@ static int realign(args_t *args, bcf1_t *line) int i, nref, reflen = strlen(line->d.allele[0]); char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); - replace_iupac_codes(ref,nref); + replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N - // does REF contain non-standard bases? - if ( replace_iupac_codes(line->d.allele[0],reflen) ) + // does VCF REF contain non-standard bases? + if ( has_non_acgtn(line->d.allele[0],reflen) ) { - args->nchanged++; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + if ( args->check_ref==CHECK_REF_EXIT ) + error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); + if ( args->check_ref & CHECK_REF_WARN ) + fprintf(pysam_stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); + free(ref); + return ERR_REF_MISMATCH; } if ( strcasecmp(ref,line->d.allele[0]) ) { @@ -291,6 +308,16 @@ static int realign(args_t *args, bcf1_t *line) for (i=0; in_allele; i++) { if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele + if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion + if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC; // breakend, not an error + if ( has_non_acgtn(line->d.allele[i],0) ) + { + if ( args->check_ref==CHECK_REF_EXIT ) + error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); + if ( args->check_ref & CHECK_REF_WARN ) + fprintf(pysam_stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); + return ERR_REF_MISMATCH; + } als[i].l = 0; kputs(line->d.allele[i], &als[i]); @@ -392,18 +419,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \ if ( len==BCF_VL_A ) \ { \ - assert( ret==src->n_allele-1); \ + if ( ret!=src->n_allele-1 ) \ + error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \ bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ } \ else if ( len==BCF_VL_R ) \ { \ - assert( ret==src->n_allele); \ + if ( ret!=src->n_allele ) \ + error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ } \ else if ( len==BCF_VL_G ) \ { \ - assert( ret==src->n_allele*(src->n_allele+1)/2 ); \ + if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \ + error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ if ( ialt!=0 ) \ { \ vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ @@ -547,7 +580,9 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( len==BCF_VL_A ) \ { \ - assert( nvals==(src->n_allele-1)*nsmpl); \ + if ( nvals!=(src->n_allele-1)*nsmpl ) \ + error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ nvals /= nsmpl; \ type_t *src_vals = vals, *dst_vals = vals; \ for (i=0; in_allele*nsmpl); \ + if ( nvals!=src->n_allele*nsmpl ) \ + error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \ nvals /= nsmpl; \ type_t *src_vals = vals, *dst_vals = vals; \ for (i=0; in_allele*(src->n_allele+1)/2 || nfields==src->n_allele ); + if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele ) + error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n", + tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); + int len = 0; if ( nfields==src->n_allele ) // haploid { @@ -996,7 +1036,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ else { int ial = bcf_gt_allele(gt2[k]); - assert( ialmaps[i].nals ); + if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial); gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); } } @@ -1585,7 +1625,8 @@ static void normalize_vcf(args_t *args) { htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out, args->n_threads); + if ( args->n_threads ) + hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); bcf_hdr_write(out, args->hdr); @@ -1668,7 +1709,7 @@ static void usage(void) fprintf(pysam_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); fprintf(pysam_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); fprintf(pysam_stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); - fprintf(pysam_stderr, " -f, --fasta-ref reference sequence\n"); + fprintf(pysam_stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); fprintf(pysam_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); fprintf(pysam_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); @@ -1679,7 +1720,7 @@ static void usage(void) fprintf(pysam_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, " --threads number of extra (de)compression threads [0]\n"); fprintf(pysam_stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(pysam_stderr, "\n"); exit(1); @@ -1806,6 +1847,7 @@ int main_vcfnorm(int argc, char *argv[]) error("Failed to read the targets: %s\n", args->targets); } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n"); init_data(args); diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index 87a773f..bfd6ad2 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Petr Danecek @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -47,7 +48,7 @@ typedef struct _plugin_t plugin_t; * Plugin API: * ---------- * const char *about(void) - * - short description used by 'bcftools plugin -l' + * - short description used by 'bcftools plugin -lv' * * const char *usage(void) * - longer description used by 'bcftools +name -h' @@ -170,11 +171,11 @@ static void add_plugin_paths(args_t *args, const char *path) args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1)); args->plugin_paths[args->nplugin_paths] = dir; args->nplugin_paths++; - if ( args->verbose ) fprintf(stderr, "plugin directory %s .. ok\n", dir); + if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. ok\n", dir); } else { - if ( args->verbose ) fprintf(stderr, "plugin directory %s .. %s\n", dir, strerror(errno)); + if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. %s\n", dir, strerror(errno)); free(dir); } @@ -210,7 +211,7 @@ static void *dlopen_plugin(args_t *args, const char *fname) { tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname); handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though - if ( args->verbose ) + if ( args->verbose > 1 ) { if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); else fprintf(stderr,"%s:\n\tdlopen .. ok\n", tmp); @@ -221,7 +222,7 @@ static void *dlopen_plugin(args_t *args, const char *fname) } handle = dlopen(fname, RTLD_NOW); - if ( args->verbose ) + if ( args->verbose > 1 ) { if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); else fprintf(stderr,"%s:\n\tdlopen .. ok\n", fname); @@ -266,19 +267,19 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) plugin->init = NULL; else - if ( args->verbose ) fprintf(stderr,"\tinit .. ok\n"); + if ( args->verbose > 1 ) fprintf(stderr,"\tinit .. ok\n"); plugin->run = (dl_run_f) dlsym(plugin->handle, "run"); ret = dlerror(); if ( ret ) plugin->run = NULL; else - if ( args->verbose ) fprintf(stderr,"\trun .. ok\n"); + if ( args->verbose > 1 ) fprintf(stderr,"\trun .. ok\n"); if ( !plugin->init && !plugin->run ) { if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); - else if ( args->verbose ) fprintf(stderr,"\tinit/run .. not found\n"); + else if ( args->verbose > 1 ) fprintf(stderr,"\tinit/run .. not found\n"); return -1; } @@ -287,7 +288,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) { if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name); - else if ( args->verbose ) fprintf(stderr,"\tversion .. not found\n"); + else if ( args->verbose > 1 ) fprintf(stderr,"\tversion .. not found\n"); return -1; } @@ -392,8 +393,13 @@ static int list_plugins(args_t *args) qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name); for (i=0; iverbose ) + printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about()); + else + printf("%s\n", plugins[i].name); + } + if ( args->verbose ) printf("\n"); } else print_plugin_usage_hint(); @@ -460,12 +466,33 @@ static void usage(args_t *args) fprintf(stderr, "Plugin options:\n"); fprintf(stderr, " -h, --help list plugin's options\n"); fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(stderr, " -v, --verbose print debugging information on plugin failure\n"); + fprintf(stderr, " -v, --verbose print verbose information, -vv increases verbosity\n"); fprintf(stderr, " -V, --version print version string and exit\n"); fprintf(stderr, "\n"); exit(1); } +static int is_verbose(int argc, char *argv[]) +{ + int c, verbose = 0, opterr_ori = opterr; + static struct option loptions[] = + { + {"verbose",no_argument,NULL,'v'}, + {NULL,0,NULL,0} + }; + opterr = 0; + while ((c = getopt_long(argc, argv, "-v",loptions,NULL)) >= 0) + { + switch (c) { + case 'v': verbose++; break; + case 1: + default: break; + } + } + opterr = opterr_ori; + optind = 0; + return verbose; +} int main_plugin(int argc, char *argv[]) { int c; @@ -483,6 +510,7 @@ int main_plugin(int argc, char *argv[]) char *plugin_name = NULL; if ( argv[1][0]!='-' ) { + args->verbose = is_verbose(argc, argv); plugin_name = argv[1]; argc--; argv++; @@ -518,7 +546,7 @@ int main_plugin(int argc, char *argv[]) { switch (c) { case 'V': version_only = 1; break; - case 'v': args->verbose = 1; break; + case 'v': args->verbose++; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index 8365f7e..ec1d586 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -2,7 +2,7 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Petr Danecek @@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -49,7 +50,7 @@ typedef struct _plugin_t plugin_t; * Plugin API: * ---------- * const char *about(void) - * - short description used by 'bcftools plugin -l' + * - short description used by 'bcftools plugin -lv' * * const char *usage(void) * - longer description used by 'bcftools +name -h' @@ -172,11 +173,11 @@ static void add_plugin_paths(args_t *args, const char *path) args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1)); args->plugin_paths[args->nplugin_paths] = dir; args->nplugin_paths++; - if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir); + if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir); } else { - if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno)); + if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno)); free(dir); } @@ -212,7 +213,7 @@ static void *dlopen_plugin(args_t *args, const char *fname) { tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname); handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though - if ( args->verbose ) + if ( args->verbose > 1 ) { if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", tmp); @@ -223,7 +224,7 @@ static void *dlopen_plugin(args_t *args, const char *fname) } handle = dlopen(fname, RTLD_NOW); - if ( args->verbose ) + if ( args->verbose > 1 ) { if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", fname); @@ -268,19 +269,19 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) plugin->init = NULL; else - if ( args->verbose ) fprintf(pysam_stderr,"\tinit .. ok\n"); + if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit .. ok\n"); plugin->run = (dl_run_f) dlsym(plugin->handle, "run"); ret = dlerror(); if ( ret ) plugin->run = NULL; else - if ( args->verbose ) fprintf(pysam_stderr,"\trun .. ok\n"); + if ( args->verbose > 1 ) fprintf(pysam_stderr,"\trun .. ok\n"); if ( !plugin->init && !plugin->run ) { if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); - else if ( args->verbose ) fprintf(pysam_stderr,"\tinit/run .. not found\n"); + else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit/run .. not found\n"); return -1; } @@ -289,7 +290,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) { if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name); - else if ( args->verbose ) fprintf(pysam_stderr,"\tversion .. not found\n"); + else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tversion .. not found\n"); return -1; } @@ -394,8 +395,13 @@ static int list_plugins(args_t *args) qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name); for (i=0; iverbose ) + fprintf(pysam_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about()); + else + fprintf(pysam_stdout, "%s\n", plugins[i].name); + } + if ( args->verbose ) fprintf(pysam_stdout, "\n"); } else print_plugin_usage_hint(); @@ -462,12 +468,33 @@ static void usage(args_t *args) fprintf(pysam_stderr, "Plugin options:\n"); fprintf(pysam_stderr, " -h, --help list plugin's options\n"); fprintf(pysam_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(pysam_stderr, " -v, --verbose print debugging information on plugin failure\n"); + fprintf(pysam_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n"); fprintf(pysam_stderr, " -V, --version print version string and exit\n"); fprintf(pysam_stderr, "\n"); exit(1); } +static int is_verbose(int argc, char *argv[]) +{ + int c, verbose = 0, opterr_ori = opterr; + static struct option loptions[] = + { + {"verbose",no_argument,NULL,'v'}, + {NULL,0,NULL,0} + }; + opterr = 0; + while ((c = getopt_long(argc, argv, "-v",loptions,NULL)) >= 0) + { + switch (c) { + case 'v': verbose++; break; + case 1: + default: break; + } + } + opterr = opterr_ori; + optind = 0; + return verbose; +} int main_plugin(int argc, char *argv[]) { int c; @@ -485,6 +512,7 @@ int main_plugin(int argc, char *argv[]) char *plugin_name = NULL; if ( argv[1][0]!='-' ) { + args->verbose = is_verbose(argc, argv); plugin_name = argv[1]; argc--; argv++; @@ -520,7 +548,7 @@ int main_plugin(int argc, char *argv[]) { switch (c) { case 'V': version_only = 1; break; - case 'v': args->verbose = 1; break; + case 'v': args->verbose++; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c index 9560559..9437d7e 100644 --- a/bcftools/vcfroh.c +++ b/bcftools/vcfroh.c @@ -30,12 +30,19 @@ THE SOFTWARE. */ #include #include #include +#include +#include #include "bcftools.h" #include "HMM.h" +#include "smpl_ilist.h" #define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies #define STATE_AZ 1 // autozygous state +#define OUTPUT_ST (1<<1) +#define OUTPUT_RG (1<<2) +#define OUTPUT_GZ (1<<3) + /** Genetic map */ typedef struct { @@ -44,6 +51,24 @@ typedef struct } genmap_t; +/** HMM data for each sample */ +typedef struct +{ + double *eprob; // emission probs [2*nsites,msites] + uint32_t *sites; // positions [nsites,msites] + int nsites, msites; + int igenmap; // current position in genmap + int nused; // some stats to detect if things didn't go wrong + int nrid, *rid, *rid_off; // for viterbi training, keep all chromosomes + void *snapshot; // hmm snapshot + struct { + uint32_t beg,end,nqual; + double qual; + int rid, state; + } rg; +} +smpl_t; + typedef struct _args_t { bcf_srs_t *files; @@ -57,29 +82,32 @@ typedef struct _args_t double rec_rate; // constant recombination rate if > 0 hmm_t *hmm; - double *eprob; // emission probs [2*nsites,msites] - uint32_t *sites; // positions [nsites,msites] - int nsites, msites; + double baum_welch_th; int nrids, *rids, *rid_offs; // multiple chroms with vi_training + int nbuf_max, nbuf_olap; - int32_t *itmp; - int nitmp, mitmp; float *AFs; - int mAFs; + int32_t *itmp; + int mAFs, nitmp, mitmp, pl_hdr_id, gt_hdr_id; double pl2p[256], *pdg; int32_t skip_rid, prev_rid, prev_pos; - int ntot, nused; // some stats to detect if things didn't go awfully wrong - int ismpl, nsmpl; // index of query sample - char *estimate_AF, *sample; // list of samples for AF estimate and query sample - char **argv, *targets_list, *regions_list, *af_fname, *af_tag; - int argc, fake_PLs, snps_only, vi_training; + int ntot; // some stats to detect if things didn't go wrong + smpl_t *smpl; // HMM data for each sample + smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF) + smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file) + char *estimate_AF; // list of samples for AF estimate and query sample + int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT + char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname; + int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads; + BGZF *out; + kstring_t str; } args_t; void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob); -void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob); +void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob); void *smalloc(size_t size) { @@ -90,57 +118,137 @@ void *smalloc(size_t size) static void init_data(args_t *args) { + int i; + args->prev_rid = args->skip_rid = -1; args->hdr = args->files->readers[0].header; - if ( !args->sample ) - { - if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n"); - args->sample = strdup(args->hdr->samples[0]); - } if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n"); - // Set samples - kstring_t str = {0,0,0}; - if ( args->estimate_AF && strcmp("-",args->estimate_AF) ) + if ( !args->fake_PLs ) { - int i, n; - char **smpls = hts_readlist(args->estimate_AF, 1, &n); + args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) ) + error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n"); + if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) + error("Error: The FORMAT/PL tag not defined as Integer in the header\n"); + } - // Make sure the query sample is included - for (i=0; isample,smpls[i]) ) break; + if ( args->estimate_AF ) + { + if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3; + else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; } + if ( strcmp("-",args->estimate_AF) ) + args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); + } - // Add the query sample if not present - if ( i!=n ) kputs(args->sample, &str); + if ( args->estimate_AF || args->fake_PLs ) + { + if ( args->af_from_PL ) + { + args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) ) + error("Error: The FORMAT/PL tag not found in the header\n"); + } + else + { + args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) ) + error("Error: The FORMAT/GT tag not found in the header\n"); + } + } + if ( args->fake_PLs ) + { + args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) ) + error("Error: The FORMAT/GT tag not found in the header\n"); + } - for (i=0; iroh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); + if ( args->samples ) + { + // we may be able to subset to a few samples, for a text VCF this can be a major speedup + if ( (bcf_sr_get_reader(args->files,0))->file->format.format==vcf ) { - if ( str.l ) kputc(',', &str); - kputs(smpls[i], &str); - free(smpls[i]); + kstring_t str = {0,0,0}; + smpl_ilist_t *tmp = args->roh_smpl, *rmme = NULL; + if ( args->af_smpl ) + { + for (i=0; iroh_smpl->n; i++) + { + if ( str.l ) kputc(',', &str); + kputs(args->hdr->samples[args->roh_smpl->idx[i]], &str); + } + for (i=0; iaf_smpl->n; i++) + { + kputc(',', &str); + kputs(args->hdr->samples[args->af_smpl->idx[i]], &str); + } + rmme = tmp = smpl_ilist_init(args->hdr, str.s, 0, SMPL_NONE); + } + if ( tmp->n < bcf_hdr_nsamples(args->hdr) ) + { + str.l = 0; + for (i=0; in; i++) + { + if ( str.l ) kputc(',', &str); + kputs(args->hdr->samples[tmp->idx[i]], &str); + } + int ret = bcf_hdr_set_samples(args->hdr, str.s, 0); + if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s); + else if ( ret>0 ) error("The %d-th sample not found in the VCF: %s\n", ret,str.s); + + // update sample ids + smpl_ilist_destroy(args->roh_smpl); + args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); + + if ( args->af_smpl ) + { + smpl_ilist_destroy(args->af_smpl); + args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); + } + } + free(str.s); + if ( rmme ) + smpl_ilist_destroy(rmme); } - free(smpls); } - else if ( !args->estimate_AF ) - kputs(args->sample, &str); - if ( str.l ) + // check whether all samples are in this list. If so, the lookup will not be needed + if ( args->af_smpl && args->af_smpl->n == bcf_hdr_nsamples(args->hdr) ) { - int ret = bcf_hdr_set_samples(args->hdr, str.s, 0); - if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s); - else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret); + // all samples are in this list + smpl_ilist_destroy(args->af_smpl); + args->af_smpl = NULL; } - if ( args->af_tag ) - if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) ) - error("No such INFO tag in the VCF: %s\n", args->af_tag); + if ( args->buffer_size ) + { + args->nbuf_olap = -1; + char *end; + double tmp = strtod(args->buffer_size,&end); + if ( *end ) + { + if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size); + args->nbuf_olap = strtol(end+1,&end,10); + if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size); + } + if ( tmp<0 ) + args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n; + else + args->nbuf_max = tmp; - args->nsmpl = bcf_hdr_nsamples(args->hdr); - args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample); - free(str.s); + if ( args->nbuf_olap<0 ) + args->nbuf_olap = args->nbuf_max*0.01; + } + fprintf(stderr,"Number of target samples: %d\n", args->roh_smpl->n); + fprintf(stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? bcf_hdr_nsamples(args->hdr) : 0)); + fprintf(stderr,"Number of sites in the buffer/overlap: "); + if ( args->nbuf_max ) fprintf(stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap); + else fprintf(stderr,"unlimited\n"); + + args->smpl = (smpl_t*) calloc(args->roh_smpl->n,sizeof(smpl_t)); - int i; for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.); // Init transition matrix and HMM @@ -150,40 +258,88 @@ static void init_data(args_t *args) MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ; MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + args->hmm = hmm_init(2, tprob, 10000); if ( args->genmap_fname ) - { - args->hmm = hmm_init(2, tprob, 0); hmm_set_tprob_func(args->hmm, set_tprob_genmap, args); - } else if ( args->rec_rate > 0 ) - { - args->hmm = hmm_init(2, tprob, 0); - hmm_set_tprob_func(args->hmm, set_tprob_recrate, args); + hmm_set_tprob_func(args->hmm, set_tprob_rrate, args); - } - else - args->hmm = hmm_init(2, tprob, 10000); + args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); + if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno)); // print header - printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version()); - printf("# The command line was:\tbcftools %s", args->argv[0]); + args->str.l = 0; + ksprintf(&args->str, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version()); + ksprintf(&args->str, "# The command line was:\tbcftools %s", args->argv[0]); for (i=1; iargc; i++) - printf(" %s",args->argv[i]); - printf("\n#\n"); - printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n"); + ksprintf(&args->str, " %s",args->argv[i]); + ksprintf(&args->str, "\n#\n"); + if ( args->output_type & OUTPUT_RG ) + { + i = 2; + ksprintf(&args->str, "# RG"); + ksprintf(&args->str, "\t[%d]Sample", i++); + ksprintf(&args->str, "\t[%d]Chromosome", i++); + ksprintf(&args->str, "\t[%d]Start", i++); + ksprintf(&args->str, "\t[%d]End", i++); + ksprintf(&args->str, "\t[%d]Length (bp)", i++); + ksprintf(&args->str, "\t[%d]Number of markers", i++); + ksprintf(&args->str, "\t[%d]Quality (average fwd-bwd phred score)", i++); + ksprintf(&args->str, "\n"); + } + if ( args->output_type & OUTPUT_ST ) + { + i = 2; + ksprintf(&args->str, "# ST"); + ksprintf(&args->str, "\t[%d]Sample", i++); + ksprintf(&args->str, "\t[%d]Chromosome", i++); + ksprintf(&args->str, "\t[%d]Position", i++); + ksprintf(&args->str, "\t[%d]State (0:HW, 1:AZ)", i++); + ksprintf(&args->str, "\t[%d]Quality (fwd-bwd phred score)", i++); + ksprintf(&args->str, "\n"); + } + if ( args->vi_training) + { + i = 2; + ksprintf(&args->str, "# VT, Viterbi Training"); + ksprintf(&args->str, "\t[%d]Sample", i++); + ksprintf(&args->str, "\t[%d]Iteration", i++); + ksprintf(&args->str, "\t[%d]dAZ", i++); + ksprintf(&args->str, "\t[%d]dHW", i++); + ksprintf(&args->str, "\t[%d]1 - P(HW|HW)", i++); + ksprintf(&args->str, "\t[%d]P(AZ|HW)", i++); + ksprintf(&args->str, "\t[%d]1 - P(AZ|AZ)", i++); + ksprintf(&args->str, "\t[%d]P(HW|AZ)", i++); + ksprintf(&args->str, "\n"); + } + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) + error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } static void destroy_data(args_t *args) { - free(args->sites); - free(args->eprob); - free(args->sample); + if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname); + int i; + for (i=0; iroh_smpl->n; i++) + { + free(args->smpl[i].eprob); + free(args->smpl[i].sites); + free(args->smpl[i].rid); + free(args->smpl[i].rid_off); + free(args->smpl[i].snapshot); + } + free(args->str.s); + free(args->smpl); + if ( args->af_smpl ) smpl_ilist_destroy(args->af_smpl); + smpl_ilist_destroy(args->roh_smpl); free(args->rids); free(args->rid_offs); hmm_destroy(args->hmm); bcf_sr_destroy(args->files); - free(args->itmp); free(args->AFs); free(args->pdg); + free(args->AFs); free(args->pdg); free(args->genmap); + free(args->itmp); + free(args->samples); } static int load_genmap(args_t *args, bcf1_t *line) @@ -220,21 +376,22 @@ static int load_genmap(args_t *args, bcf1_t *line) hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap); genmap_t *gm = &args->genmap[args->ngenmap-1]; + // position, convert to 0-based char *tmp, *end; gm->pos = strtol(str.s, &tmp, 10); if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s); + gm->pos -= 1; // skip second column tmp++; while ( *tmp && !isspace(*tmp) ) tmp++; - // read the genetic map in cM + // read the genetic map in cM, scale from % to likelihood gm->rate = strtod(tmp+1, &end); if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s); + gm->rate *= 0.01; } if ( !args->ngenmap ) error("Genetic map empty?\n"); - int i; - for (i=0; ingenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1 if ( hts_close(fp) ) error("Close failed\n"); free(str.s); return 0; @@ -255,7 +412,6 @@ static double get_genmap_rate(args_t *args, int start, int end) // position j to be equal or larger than end int j = i; while ( j+1ngenmap && args->genmap[j].pos < end ) j++; - if ( i==j ) { args->igenmap = i; @@ -272,17 +428,20 @@ static double get_genmap_rate(args_t *args, int start, int end) void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob) { args_t *args = (args_t*) data; - double ci = get_genmap_rate(args, pos - prev_pos, pos); + double ci = get_genmap_rate(args, prev_pos, pos); + if ( args->rec_rate ) ci *= args->rec_rate; + if ( ci > 1 ) ci = 1; MAT(tprob,2,STATE_HW,STATE_AZ) *= ci; MAT(tprob,2,STATE_AZ,STATE_HW) *= ci; MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ); MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW); } -void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob) +void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob) { args_t *args = (args_t*) data; double ci = (pos - prev_pos) * args->rec_rate; + if ( ci > 1 ) ci = 1; MAT(tprob,2,STATE_HW,STATE_AZ) *= ci; MAT(tprob,2,STATE_AZ,STATE_HW) *= ci; MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ); @@ -315,132 +474,163 @@ void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, * */ -static void flush_viterbi(args_t *args) +static void flush_viterbi(args_t *args, int ismpl) { - int i,j; + smpl_t *smpl = &args->smpl[ismpl]; + if ( !smpl->nsites ) return; - if ( !args->nsites ) return; + const char *name = args->hdr->samples[ args->roh_smpl->idx[ismpl] ]; - if ( !args->vi_training ) + int i,j,k; + + if ( !args->vi_training ) // single viterbi pass { - // single viterbi pass, one chromsome - hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites); - hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites); + hmm_restore(args->hmm, smpl->snapshot); + int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites; + if ( end < smpl->nsites ) + smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1); + + args->igenmap = smpl->igenmap; + hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites); + hmm_run_fwd_bwd(args->hmm, smpl->nsites, smpl->eprob, smpl->sites); double *fwd = hmm_get_fwd_bwd_prob(args->hmm); - const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid); - uint8_t *vpath = hmm_get_viterbi_path(args->hmm); - for (i=0; insites; i++) + const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid); + uint8_t *vpath = hmm_get_viterbi_path(args->hmm); + + for (i=0; isites[i]+1, state, phred_score(1.0-pval[state])); - } - return; - } + double qual = phred_score(1.0 - fwd[i*2 + state]); + if ( args->output_type & OUTPUT_ST ) + { + args->str.l = 0; + ksprintf(&args->str, "ST\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[i]+1, state, qual); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); + } - // viterbi training, multiple chromosomes - double t2az_prev, t2hw_prev; - double deltaz, delthw; - int niter = 0; - do - { - double *tprob_arr = hmm_get_tprob(args->hmm); - t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ; - t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW; - double tcounts[] = { 0,0,0,0 }; - for (i=0; inrids; i++) - { - // run viterbi for each chromosomes. eprob and sites contain - // multiple chromosomes, rid_offs mark the boundaries - int ioff = args->rid_offs[i]; - int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff; - hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); - - // what transitions were observed: add to the total counts - uint8_t *vpath = hmm_get_viterbi_path(args->hmm); - for (j=1; joutput_type & OUTPUT_RG ) { - // count the number of transitions - int prev_state = vpath[2*(j-1)]; - int curr_state = vpath[2*j]; - MAT(tcounts,2,curr_state,prev_state) += 1; + if ( state!=smpl->rg.state ) + { + if ( !state ) // the region ends, flush + { + args->str.l = 0; + ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid), + smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); + smpl->rg.state = 0; + } + else + { + smpl->rg.state = 1; + smpl->rg.beg = smpl->sites[i]; + smpl->rg.rid = args->prev_rid; + } + } + else if ( state ) + { + smpl->rg.nqual++; + smpl->rg.qual += qual; + smpl->rg.end = smpl->sites[i]; + } } } - // update the transition matrix - int n = 1; - for (i=0; i<2; i++) + if ( end < smpl->nsites ) { - for (j=0; j<2; j++) n += MAT(tcounts,2,i,j); + end = smpl->nsites - args->nbuf_olap; + memmove(smpl->sites, smpl->sites + end, sizeof(*smpl->sites)*args->nbuf_olap); + memmove(smpl->eprob, smpl->eprob + end*2, sizeof(*smpl->eprob)*args->nbuf_olap*2); + smpl->nsites = args->nbuf_olap; + smpl->igenmap = args->igenmap; } - for (i=0; i<2; i++) + else { - for (j=0; j<2; j++) + smpl->nsites = 0; + smpl->igenmap = 0; + + if ( smpl->rg.state ) { - // no transition to i-th state was observed, set to a small number - if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n; - else MAT(tcounts,2,i,j) /= n; + args->str.l = 0; + ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid), + smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); + smpl->rg.state = 0; } } - // normalize - for (i=0; i<2; i++) + return; + } + + + // viterbi training, multiple chromosomes + double t2az_prev, t2hw_prev; + double deltaz, delthw; + + double *tprob_arr = hmm_get_tprob(args->hmm); + MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; + MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW; + MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ; + MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + hmm_set_tprob(args->hmm, tprob_arr, 10000); + + int niter = 0; + do + { + tprob_arr = hmm_get_tprob(args->hmm); + t2az_prev = MAT(tprob_arr,2,STATE_AZ,STATE_HW); //args->t2AZ; + t2hw_prev = MAT(tprob_arr,2,STATE_HW,STATE_AZ); //args->t2HW; + double tprob_new[] = { 0,0,0,0 }; + for (i=0; inrid; i++) { - double norm = 0; - for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i); - assert( norm!=0 ); - for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm; + int ioff = smpl->rid_off[i]; + int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff; + args->igenmap = 0; + tprob_arr = hmm_run_baum_welch(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff); + for (j=0; j<2; j++) + for (k=0; k<2; k++) MAT(tprob_new,2,j,k) += MAT(tprob_arr,2,j,k); } + for (j=0; j<2; j++) + for (k=0; k<2; k++) MAT(tprob_new,2,j,k) /= smpl->nrid; - if ( args->genmap_fname || args->rec_rate > 0 ) - hmm_set_tprob(args->hmm, tcounts, 0); - else - hmm_set_tprob(args->hmm, tcounts, 10000); + hmm_set_tprob(args->hmm, tprob_new, 10000); - tprob_arr = hmm_get_tprob(args->hmm); - deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev); - delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev); + deltaz = fabs(MAT(tprob_new,2,STATE_AZ,STATE_HW)-t2az_prev); + delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev); niter++; - fprintf(stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", - niter,deltaz,delthw, - MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), - MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); + args->str.l = 0; + ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", + name,niter,deltaz,delthw, + 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW), + 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ)); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } - while ( deltaz > 0.0 || delthw > 0.0 ); - double *tprob_arr = hmm_get_tprob(args->hmm); - fprintf(stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter, - MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), - MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); + while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th ); // output the results - for (i=0; inrids; i++) + for (i=0; inrid; i++) { - int ioff = args->rid_offs[i]; - int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff; - hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); - hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); + int ioff = smpl->rid_off[i]; + int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff; + args->igenmap = 0; + hmm_run_viterbi(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff); + hmm_run_fwd_bwd(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff); uint8_t *vpath = hmm_get_viterbi_path(args->hmm); double *fwd = hmm_get_fwd_bwd_prob(args->hmm); - const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]); + const char *chr = bcf_hdr_id2name(args->hdr,smpl->rid[i]); for (j=0; jsites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval); + int state = vpath[j*2]==STATE_AZ ? 1 : 0; + double *pval = fwd + j*2; + args->str.l = 0; + ksprintf(&args->str, "ROH\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[ioff+j]+1, state, phred_score(1.0-pval[state])); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } } } -static void push_rid(args_t *args, int rid) -{ - args->nrids++; - args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int)); - args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int)); - args->rids[ args->nrids-1 ] = rid; - args->rid_offs[ args->nrids-1 ] = args->nsites; -} int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { @@ -468,27 +658,52 @@ int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) return 0; } -int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq) +int8_t *get_GT(args_t *args, bcf1_t *line) { - if ( !args->nitmp ) - { - args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp); - if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid? - args->nitmp /= args->nsmpl; - } + int i; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==args->gt_hdr_id ) break; + if ( i==line->n_fmt ) return NULL; // the tag is not present in this record + + bcf_fmt_t *fmt = &line->d.fmt[i]; + if ( fmt->n!=2 ) return NULL; // not diploid + if ( fmt->type!=BCF_BT_INT8 ) error("This is unexpected, GT type is %d\n", fmt->type); + return (int8_t*) fmt->p; +} + +int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq) +{ int i, nalt = 0, nref = 0; - for (i=0; insmpl; i++) + if ( args->af_smpl ) // subset samples for AF estimate { - int32_t *gt = &args->itmp[i*args->nitmp]; + for (i=0; iaf_smpl->n; i++) + { + int ismpl = args->af_smpl->idx[i]; + if ( bcf_gt_is_missing(gt[2*ismpl]) || bcf_gt_is_missing(gt[2*ismpl+1]) ) continue; - if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + if ( bcf_gt_allele(gt[2*ismpl]) ) nalt++; + else nref++; - if ( bcf_gt_allele(gt[0]) ) nalt++; - else nref++; + if ( bcf_gt_allele(gt[2*ismpl+1]) ) nalt++; + else nref++; + } + } + else // all samples used in AF estimate + { + int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr); + while ( gt < end ) + { + if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + + if ( bcf_gt_allele(gt[0]) ) nalt++; + else nref++; + + if ( bcf_gt_allele(gt[1]) ) nalt++; + else nref++; - if ( bcf_gt_allele(gt[1]) ) nalt++; - else nref++; + gt += 2; + } } if ( !nalt && !nref ) return -1; @@ -496,105 +711,249 @@ int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq) return 0; } +int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_freq) +{ + double af = 0; + int i, j, naf = 0; + + int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial); + if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields + + if ( args->af_smpl ) // subset samples for AF estimate + { + #define BRANCH(type_t) \ + { \ + for (i=0; iaf_smpl->n; i++) \ + { \ + int ismpl = args->af_smpl->idx[i]; \ + type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ + if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ + if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ + double prob[3], norm = 0; \ + prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ + prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ + prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ + for (j=0; j<3; j++) norm += prob[j]; \ + for (j=0; j<3; j++) prob[j] /= norm; \ + af += 0.5*prob[1] + prob[2]; \ + naf++; \ + } \ + } + switch (fmt_pl->type) { + case BCF_BT_INT8: BRANCH(int8_t); break; + case BCF_BT_INT16: BRANCH(int16_t); break; + case BCF_BT_INT32: BRANCH(int32_t); break; + default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + } + #undef BRANCH + } + else // all samples used in AF estimate + { + int nsmpl = bcf_hdr_nsamples(args->hdr); + #define BRANCH(type_t) \ + { \ + type_t *p = (type_t*)fmt_pl->p; \ + p -= fmt_pl->n; \ + for (i=0; in; \ + if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ + if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ + double prob[3], norm = 0; \ + prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ + prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ + prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ + for (j=0; j<3; j++) norm += prob[j]; \ + for (j=0; j<3; j++) prob[j] /= norm; \ + af += 0.5*prob[1] + prob[2]; \ + naf++; \ + } \ + } + switch (fmt_pl->type) { + case BCF_BT_INT8: BRANCH(int8_t); break; + case BCF_BT_INT16: BRANCH(int16_t); break; + case BCF_BT_INT32: BRANCH(int32_t); break; + default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + } + #undef BRANCH + } + if ( !naf ) return -1; + + *alt_freq = af / naf; + return 0; +} + +bcf_fmt_t *get_PL(args_t *args, bcf1_t *line) +{ + int i; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==args->pl_hdr_id ) return &line->d.fmt[i]; + return NULL; +} -int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg) +int process_line(args_t *args, bcf1_t *line, int ial) { - args->nitmp = 0; + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + + double alt_freq; + int8_t *GTs = NULL; + bcf_fmt_t *fmt_pl = NULL; // Set allele frequency - int ret; + int ret = 0, i,j; if ( args->af_tag ) { // Use an INFO tag provided by the user ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs); - if ( ret==1 ) - *alt_freq = args->AFs[0]; + if ( ret>0 ) + alt_freq = args->AFs[ial-1]; if ( ret==-2 ) error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1); } else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args->files->targets, line, alt_freq); + ret = read_AF(args->files->targets, line, &alt_freq); + } + else if ( args->dflt_AF > 0 ) + { + alt_freq = args->dflt_AF; + } + else if ( args->estimate_AF ) + { + // Estimate AF from GTs or PLs of all samples or samples listed in a file + if ( args->af_from_PL ) + { + fmt_pl = get_PL(args, line); + if ( !fmt_pl ) return -1; + ret = estimate_AF_from_PL(args, fmt_pl, ial, &alt_freq); + } + else + { + GTs = get_GT(args, line); + if ( !GTs ) return -1; + ret = estimate_AF_from_GT(args, GTs, &alt_freq); + } } else { - // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF - ret = -1; - if ( !args->estimate_AF ) + // Use AC/AN + int AC = -1, AN = 0; + ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp); + if ( ret==1 ) { - int AC = -1, AN = 0; - ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp); - if ( ret==1 ) - { - AN = args->itmp[0]; - ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp); - if ( ret>0 ) - AC = args->itmp[0]; - } - if ( AN<=0 || AC<0 ) - ret = -1; - else - *alt_freq = (double) AC/AN; + AN = args->itmp[0]; + ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp); + if ( ret>0 ) + AC = args->itmp[0]; } - if ( ret==-1 ) - ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp + if ( AN<=0 || AC<0 ) + ret = -1; + else + alt_freq = (double) AC/AN; } if ( ret<0 ) return ret; - if ( *alt_freq==0.0 ) - { - if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0 - *alt_freq = args->dflt_AF; - } + if ( alt_freq==0.0 ) return -1; - // Set P(D|G) + int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial); if ( args->fake_PLs ) { - if ( !args->nitmp ) - { - args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp); - if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid? - args->nitmp /= args->nsmpl; - } + if ( !GTs ) GTs = get_GT(args, line); + } + else + { + fmt_pl = get_PL(args, line); + if ( !fmt_pl ) return -1; + if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields + } - int32_t *gt = &args->itmp[args->ismpl*args->nitmp]; - if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1; + for (i=0; iroh_smpl->n; i++) + { + int ismpl = args->roh_smpl->idx[i]; - int a = bcf_gt_allele(gt[0]); - int b = bcf_gt_allele(gt[1]); - if ( a!=b ) - { - pdg[0] = pdg[2] = args->unseen_PL; - pdg[1] = 1 - 2*args->unseen_PL; - } - else if ( a==0 ) + // set P(D|G) + double pdg[3]; + if ( args->fake_PLs ) { - pdg[0] = 1 - 2*args->unseen_PL; - pdg[1] = pdg[2] = args->unseen_PL; + int8_t *gt = GTs + 2*ismpl; + if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + + int a = bcf_gt_allele(gt[0]); + int b = bcf_gt_allele(gt[1]); + if ( a!=b ) + { + pdg[0] = pdg[2] = args->unseen_PL; + pdg[1] = 1 - 2*args->unseen_PL; + } + else if ( a==0 ) + { + pdg[0] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL; + pdg[1] = args->unseen_PL; + pdg[2] = args->unseen_PL*args->unseen_PL; + } + else + { + pdg[0] = args->unseen_PL*args->unseen_PL; + pdg[1] = args->unseen_PL; + pdg[2] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL; + } } else { - pdg[0] = pdg[1] = args->unseen_PL; - pdg[2] = 1 - 2*args->unseen_PL; + #define BRANCH(type_t) \ + { \ + type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ + if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ + if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ + pdg[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ + pdg[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ + pdg[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ + } + switch (fmt_pl->type) { + case BCF_BT_INT8: BRANCH(int8_t); break; + case BCF_BT_INT16: BRANCH(int16_t); break; + case BCF_BT_INT32: BRANCH(int32_t); break; + default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + } + #undef BRANCH } - } - else - { - args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp); - if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid? - args->nitmp /= args->nsmpl; - - int32_t *pl = &args->itmp[args->ismpl*args->nitmp]; - pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0; - pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0; - pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0; double sum = pdg[0] + pdg[1] + pdg[2]; - if ( !sum ) return -1; - pdg[0] /= sum; - pdg[1] /= sum; - pdg[2] /= sum; + if ( !sum ) continue; + for (j=0; j<3; j++) pdg[j] /= sum; + if ( args->skip_homref && pdg[0]>0.99 ) continue; + + smpl_t *smpl = &args->smpl[i]; + smpl->nused++; + + if ( smpl->nsites >= smpl->msites ) + { + hts_expand(uint32_t,smpl->nsites+1,smpl->msites,smpl->sites); + smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2); + if ( !smpl->eprob ) error("Error: failed to alloc %d bytes\n", sizeof(*smpl->eprob)*smpl->msites*2); + } + + // Calculate emission probabilities P(D|AZ) and P(D|HW) + double *eprob = &smpl->eprob[2*smpl->nsites]; + eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq; + eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq; + + smpl->sites[smpl->nsites] = line->pos; + smpl->nsites++; + + if ( args->vi_training ) + { + if ( !smpl->nrid || line->rid!=smpl->rid[smpl->nrid-1] ) + { + smpl->nrid++; + smpl->rid = (int*) realloc(smpl->rid,sizeof(*smpl->rid)*smpl->nrid); + smpl->rid[smpl->nrid-1] = line->rid; + smpl->rid_off = (int*) realloc(smpl->rid_off,sizeof(*smpl->rid_off)*smpl->nrid); + smpl->rid_off[smpl->nrid-1] = smpl->nsites - 1; + } + } + else if ( args->nbuf_max && smpl->nsites >= args->nbuf_max ) flush_viterbi(args, i); } return 0; @@ -602,18 +961,35 @@ int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg) static void vcfroh(args_t *args, bcf1_t *line) { + int i; + // Are we done? if ( !line ) { - flush_viterbi(args); + for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i); return; } args->ntot++; - // Skip unwanted lines + // Skip unwanted lines, for simplicity we consider only biallelic sites if ( line->rid == args->skip_rid ) return; if ( line->n_allele==1 ) return; // no ALT allele - if ( line->n_allele!=2 ) return; // only biallelic sites + if ( line->n_allele > 3 ) return; // cannot be bi-allelic, even with <*> + + // This can be raw callable VCF with the symbolic unseen allele <*> + int ial = 0; + for (i=1; in_allele; i++) + if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; } + if ( ial==0 ) // normal VCF, the symbolic allele is not present + { + if ( line->n_allele!=2 ) return; // not biallelic + ial = 1; + } + else + { + if ( line->n_allele!=3 ) return; // not biallelic + ial = ial==1 ? 2 : 1; // <*> can come in any order + } if ( args->snps_only && !bcf_is_snp(line) ) return; // Initialize genetic map @@ -623,21 +999,15 @@ static void vcfroh(args_t *args, bcf1_t *line) args->prev_rid = line->rid; args->prev_pos = line->pos; skip_rid = load_genmap(args, line); - if ( !skip_rid && args->vi_training ) push_rid(args, line->rid); } // New chromosome? if ( args->prev_rid!=line->rid ) { skip_rid = load_genmap(args, line); - if ( args->vi_training ) - { - if ( !skip_rid ) push_rid(args, line->rid); - } - else + if ( !args->vi_training ) { - flush_viterbi(args); - args->nsites = 0; + for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i); } args->prev_rid = line->rid; args->prev_pos = line->pos; @@ -655,25 +1025,8 @@ static void vcfroh(args_t *args, bcf1_t *line) args->prev_pos = line->pos; - // Ready for the new site - int m = args->msites; - hts_expand(uint32_t,args->nsites+1,args->msites,args->sites); - if ( args->msites!=m ) - args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2); - - // Set likelihoods and alternate allele frequencies - double alt_freq, pdg[3]; - if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong - - args->nused++; - - // Calculate emission probabilities P(D|AZ) and P(D|HW) - double *eprob = &args->eprob[2*args->nsites]; - eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq; - eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq; - - args->sites[args->nsites] = line->pos; - args->nsites++; + // parse the new line + process_line(args, line, ial); } static void usage(args_t *args) @@ -686,21 +1039,32 @@ static void usage(args_t *args) fprintf(stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); fprintf(stderr, " --AF-tag use TAG for allele frequency\n"); fprintf(stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(stderr, " -e, --estimate-AF calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in \n"); - fprintf(stderr, " -G, --GTs-only use GTs, ignore PLs, use for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n"); + fprintf(stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); + fprintf(stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); + fprintf(stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); + fprintf(stderr, " -e, --estimate-AF [TAG], estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); + fprintf(stderr, " in . If TAG is not given, the frequency is estimated from GT by default\n"); + fprintf(stderr, " -G, --GTs-only use GTs and ignore PLs, instead using for PL of the two least likely genotypes.\n"); + fprintf(stderr, " Safe value to use is 30 to account for GT errors.\n"); + fprintf(stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); fprintf(stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); - fprintf(stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n"); + fprintf(stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); + fprintf(stderr, " is replaced with chromosome name\n"); fprintf(stderr, " -M, --rec-rate constant recombination rate per bp\n"); + fprintf(stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --sample sample to analyze\n"); + fprintf(stderr, " -s, --samples list of samples to analyze [all samples]\n"); + fprintf(stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --threads number of extra decompression threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "HMM Options:\n"); fprintf(stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); fprintf(stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); - fprintf(stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n"); + fprintf(stderr, " -V, --viterbi-training estimate HMM parameters, is the convergence threshold, e.g. 1e-10 (experimental)\n"); fprintf(stderr, "\n"); exit(1); } @@ -721,12 +1085,17 @@ int main_vcfroh(int argc, char *argv[]) {"AF-tag",1,0,0}, {"AF-file",1,0,1}, {"AF-dflt",1,0,2}, + {"buffer-size",1,0,'b'}, + {"ignore-homref",0,0,'i'}, {"estimate-AF",1,0,'e'}, + {"output",1,0,'o'}, + {"output-type",1,0,'O'}, {"GTs-only",1,0,'G'}, - {"sample",1,0,'s'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, {"hw-to-az",1,0,'a'}, {"az-to-hw",1,0,'H'}, - {"viterbi-training",0,0,'V'}, + {"viterbi-training",1,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, @@ -734,12 +1103,13 @@ int main_vcfroh(int argc, char *argv[]) {"genetic-map",1,0,'m'}, {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, + {"threads",1,0,9}, {0,0,0,0} }; int naf_opts = 0; char *tmp; - while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) { switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; @@ -747,7 +1117,15 @@ int main_vcfroh(int argc, char *argv[]) args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; + case 'o': args->output_fname = optarg; break; + case 'O': + if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST; + if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG; + if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ; + break; case 'e': args->estimate_AF = optarg; naf_opts++; break; + case 'b': args->buffer_size = optarg; break; + case 'i': args->skip_homref = 1; break; case 'I': args->snps_only = 1; break; case 'G': args->fake_PLs = 1; @@ -760,7 +1138,8 @@ int main_vcfroh(int argc, char *argv[]) args->rec_rate = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -M %s\n", optarg); break; - case 's': args->sample = strdup(optarg); break; + case 's': args->samples = strdup(optarg); break; + case 'S': args->samples = strdup(optarg); args->samples_is_file = 1; break; case 'a': args->t2AZ = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -a %s\n", optarg); @@ -773,14 +1152,28 @@ int main_vcfroh(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 'V': args->vi_training = 1; break; + case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 'V': + args->vi_training = 1; + args->baum_welch_th = strtod(optarg,&tmp); + if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg); + break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } } + if ( !args->output_fname ) args->output_fname = "stdout"; + if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG; + char *fname = NULL; + if ( optind==argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + else usage(args); + } + else fname = argv[optind]; - if ( argcvi_training && args->buffer_size ) error("Error: cannot use -b with -V\n"); if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ); if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW); if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n"); @@ -800,7 +1193,9 @@ int main_vcfroh(int argc, char *argv[]) if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) error("Failed to read the targets: %s\n", args->af_fname); } - if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); + if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) + error("Failed to create threads\n"); + if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); while ( bcf_sr_next_line(args->files) ) @@ -808,7 +1203,15 @@ int main_vcfroh(int argc, char *argv[]) vcfroh(args, args->files->readers[0].buffer[0]); } vcfroh(args, NULL); - fprintf(stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); + int i, nmin = 0; + for (i=0; iroh_smpl->n; i++) + if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused; + fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin); + if ( nmin==0 ) + { + fprintf(stderr,"No usable sites were found."); + if ( !naf_opts && !args->dflt_AF ) fprintf(stderr, " Consider using one of the AF options.\n"); + } destroy_data(args); free(args); return 0; diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index 66ddc17..70ed798 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -32,12 +32,19 @@ THE SOFTWARE. */ #include #include #include +#include +#include #include "bcftools.h" #include "HMM.h" +#include "smpl_ilist.h" #define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies #define STATE_AZ 1 // autozygous state +#define OUTPUT_ST (1<<1) +#define OUTPUT_RG (1<<2) +#define OUTPUT_GZ (1<<3) + /** Genetic map */ typedef struct { @@ -46,6 +53,24 @@ typedef struct } genmap_t; +/** HMM data for each sample */ +typedef struct +{ + double *eprob; // emission probs [2*nsites,msites] + uint32_t *sites; // positions [nsites,msites] + int nsites, msites; + int igenmap; // current position in genmap + int nused; // some stats to detect if things didn't go wrong + int nrid, *rid, *rid_off; // for viterbi training, keep all chromosomes + void *snapshot; // hmm snapshot + struct { + uint32_t beg,end,nqual; + double qual; + int rid, state; + } rg; +} +smpl_t; + typedef struct _args_t { bcf_srs_t *files; @@ -59,29 +84,32 @@ typedef struct _args_t double rec_rate; // constant recombination rate if > 0 hmm_t *hmm; - double *eprob; // emission probs [2*nsites,msites] - uint32_t *sites; // positions [nsites,msites] - int nsites, msites; + double baum_welch_th; int nrids, *rids, *rid_offs; // multiple chroms with vi_training + int nbuf_max, nbuf_olap; - int32_t *itmp; - int nitmp, mitmp; float *AFs; - int mAFs; + int32_t *itmp; + int mAFs, nitmp, mitmp, pl_hdr_id, gt_hdr_id; double pl2p[256], *pdg; int32_t skip_rid, prev_rid, prev_pos; - int ntot, nused; // some stats to detect if things didn't go awfully wrong - int ismpl, nsmpl; // index of query sample - char *estimate_AF, *sample; // list of samples for AF estimate and query sample - char **argv, *targets_list, *regions_list, *af_fname, *af_tag; - int argc, fake_PLs, snps_only, vi_training; + int ntot; // some stats to detect if things didn't go wrong + smpl_t *smpl; // HMM data for each sample + smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF) + smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file) + char *estimate_AF; // list of samples for AF estimate and query sample + int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT + char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname; + int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads; + BGZF *out; + kstring_t str; } args_t; void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob); -void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob); +void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob); void *smalloc(size_t size) { @@ -92,57 +120,137 @@ void *smalloc(size_t size) static void init_data(args_t *args) { + int i; + args->prev_rid = args->skip_rid = -1; args->hdr = args->files->readers[0].header; - if ( !args->sample ) - { - if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n"); - args->sample = strdup(args->hdr->samples[0]); - } if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n"); - // Set samples - kstring_t str = {0,0,0}; - if ( args->estimate_AF && strcmp("-",args->estimate_AF) ) + if ( !args->fake_PLs ) { - int i, n; - char **smpls = hts_readlist(args->estimate_AF, 1, &n); + args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) ) + error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n"); + if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) + error("Error: The FORMAT/PL tag not defined as Integer in the header\n"); + } - // Make sure the query sample is included - for (i=0; isample,smpls[i]) ) break; + if ( args->estimate_AF ) + { + if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3; + else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; } + if ( strcmp("-",args->estimate_AF) ) + args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); + } - // Add the query sample if not present - if ( i!=n ) kputs(args->sample, &str); + if ( args->estimate_AF || args->fake_PLs ) + { + if ( args->af_from_PL ) + { + args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) ) + error("Error: The FORMAT/PL tag not found in the header\n"); + } + else + { + args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) ) + error("Error: The FORMAT/GT tag not found in the header\n"); + } + } + if ( args->fake_PLs ) + { + args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"); + if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) ) + error("Error: The FORMAT/GT tag not found in the header\n"); + } - for (i=0; iroh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); + if ( args->samples ) + { + // we may be able to subset to a few samples, for a text VCF this can be a major speedup + if ( (bcf_sr_get_reader(args->files,0))->file->format.format==vcf ) { - if ( str.l ) kputc(',', &str); - kputs(smpls[i], &str); - free(smpls[i]); + kstring_t str = {0,0,0}; + smpl_ilist_t *tmp = args->roh_smpl, *rmme = NULL; + if ( args->af_smpl ) + { + for (i=0; iroh_smpl->n; i++) + { + if ( str.l ) kputc(',', &str); + kputs(args->hdr->samples[args->roh_smpl->idx[i]], &str); + } + for (i=0; iaf_smpl->n; i++) + { + kputc(',', &str); + kputs(args->hdr->samples[args->af_smpl->idx[i]], &str); + } + rmme = tmp = smpl_ilist_init(args->hdr, str.s, 0, SMPL_NONE); + } + if ( tmp->n < bcf_hdr_nsamples(args->hdr) ) + { + str.l = 0; + for (i=0; in; i++) + { + if ( str.l ) kputc(',', &str); + kputs(args->hdr->samples[tmp->idx[i]], &str); + } + int ret = bcf_hdr_set_samples(args->hdr, str.s, 0); + if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s); + else if ( ret>0 ) error("The %d-th sample not found in the VCF: %s\n", ret,str.s); + + // update sample ids + smpl_ilist_destroy(args->roh_smpl); + args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); + + if ( args->af_smpl ) + { + smpl_ilist_destroy(args->af_smpl); + args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); + } + } + free(str.s); + if ( rmme ) + smpl_ilist_destroy(rmme); } - free(smpls); } - else if ( !args->estimate_AF ) - kputs(args->sample, &str); - if ( str.l ) + // check whether all samples are in this list. If so, the lookup will not be needed + if ( args->af_smpl && args->af_smpl->n == bcf_hdr_nsamples(args->hdr) ) { - int ret = bcf_hdr_set_samples(args->hdr, str.s, 0); - if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s); - else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret); + // all samples are in this list + smpl_ilist_destroy(args->af_smpl); + args->af_smpl = NULL; } - if ( args->af_tag ) - if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) ) - error("No such INFO tag in the VCF: %s\n", args->af_tag); + if ( args->buffer_size ) + { + args->nbuf_olap = -1; + char *end; + double tmp = strtod(args->buffer_size,&end); + if ( *end ) + { + if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size); + args->nbuf_olap = strtol(end+1,&end,10); + if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size); + } + if ( tmp<0 ) + args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n; + else + args->nbuf_max = tmp; - args->nsmpl = bcf_hdr_nsamples(args->hdr); - args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample); - free(str.s); + if ( args->nbuf_olap<0 ) + args->nbuf_olap = args->nbuf_max*0.01; + } + fprintf(pysam_stderr,"Number of target samples: %d\n", args->roh_smpl->n); + fprintf(pysam_stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? bcf_hdr_nsamples(args->hdr) : 0)); + fprintf(pysam_stderr,"Number of sites in the buffer/overlap: "); + if ( args->nbuf_max ) fprintf(pysam_stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap); + else fprintf(pysam_stderr,"unlimited\n"); + + args->smpl = (smpl_t*) calloc(args->roh_smpl->n,sizeof(smpl_t)); - int i; for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.); // Init transition matrix and HMM @@ -152,40 +260,88 @@ static void init_data(args_t *args) MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ; MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + args->hmm = hmm_init(2, tprob, 10000); if ( args->genmap_fname ) - { - args->hmm = hmm_init(2, tprob, 0); hmm_set_tprob_func(args->hmm, set_tprob_genmap, args); - } else if ( args->rec_rate > 0 ) - { - args->hmm = hmm_init(2, tprob, 0); - hmm_set_tprob_func(args->hmm, set_tprob_recrate, args); + hmm_set_tprob_func(args->hmm, set_tprob_rrate, args); - } - else - args->hmm = hmm_init(2, tprob, 10000); + args->out = bgzf_open(strcmp("pysam_stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); + if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno)); // print header - fprintf(pysam_stdout, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version()); - fprintf(pysam_stdout, "# The command line was:\tbcftools %s", args->argv[0]); + args->str.l = 0; + ksprintf(&args->str, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version()); + ksprintf(&args->str, "# The command line was:\tbcftools %s", args->argv[0]); for (i=1; iargc; i++) - fprintf(pysam_stdout, " %s",args->argv[i]); - fprintf(pysam_stdout, "\n#\n"); - fprintf(pysam_stdout, "# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n"); + ksprintf(&args->str, " %s",args->argv[i]); + ksprintf(&args->str, "\n#\n"); + if ( args->output_type & OUTPUT_RG ) + { + i = 2; + ksprintf(&args->str, "# RG"); + ksprintf(&args->str, "\t[%d]Sample", i++); + ksprintf(&args->str, "\t[%d]Chromosome", i++); + ksprintf(&args->str, "\t[%d]Start", i++); + ksprintf(&args->str, "\t[%d]End", i++); + ksprintf(&args->str, "\t[%d]Length (bp)", i++); + ksprintf(&args->str, "\t[%d]Number of markers", i++); + ksprintf(&args->str, "\t[%d]Quality (average fwd-bwd phred score)", i++); + ksprintf(&args->str, "\n"); + } + if ( args->output_type & OUTPUT_ST ) + { + i = 2; + ksprintf(&args->str, "# ST"); + ksprintf(&args->str, "\t[%d]Sample", i++); + ksprintf(&args->str, "\t[%d]Chromosome", i++); + ksprintf(&args->str, "\t[%d]Position", i++); + ksprintf(&args->str, "\t[%d]State (0:HW, 1:AZ)", i++); + ksprintf(&args->str, "\t[%d]Quality (fwd-bwd phred score)", i++); + ksprintf(&args->str, "\n"); + } + if ( args->vi_training) + { + i = 2; + ksprintf(&args->str, "# VT, Viterbi Training"); + ksprintf(&args->str, "\t[%d]Sample", i++); + ksprintf(&args->str, "\t[%d]Iteration", i++); + ksprintf(&args->str, "\t[%d]dAZ", i++); + ksprintf(&args->str, "\t[%d]dHW", i++); + ksprintf(&args->str, "\t[%d]1 - P(HW|HW)", i++); + ksprintf(&args->str, "\t[%d]P(AZ|HW)", i++); + ksprintf(&args->str, "\t[%d]1 - P(AZ|AZ)", i++); + ksprintf(&args->str, "\t[%d]P(HW|AZ)", i++); + ksprintf(&args->str, "\n"); + } + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) + error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } static void destroy_data(args_t *args) { - free(args->sites); - free(args->eprob); - free(args->sample); + if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname); + int i; + for (i=0; iroh_smpl->n; i++) + { + free(args->smpl[i].eprob); + free(args->smpl[i].sites); + free(args->smpl[i].rid); + free(args->smpl[i].rid_off); + free(args->smpl[i].snapshot); + } + free(args->str.s); + free(args->smpl); + if ( args->af_smpl ) smpl_ilist_destroy(args->af_smpl); + smpl_ilist_destroy(args->roh_smpl); free(args->rids); free(args->rid_offs); hmm_destroy(args->hmm); bcf_sr_destroy(args->files); - free(args->itmp); free(args->AFs); free(args->pdg); + free(args->AFs); free(args->pdg); free(args->genmap); + free(args->itmp); + free(args->samples); } static int load_genmap(args_t *args, bcf1_t *line) @@ -222,21 +378,22 @@ static int load_genmap(args_t *args, bcf1_t *line) hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap); genmap_t *gm = &args->genmap[args->ngenmap-1]; + // position, convert to 0-based char *tmp, *end; gm->pos = strtol(str.s, &tmp, 10); if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s); + gm->pos -= 1; // skip second column tmp++; while ( *tmp && !isspace(*tmp) ) tmp++; - // read the genetic map in cM + // read the genetic map in cM, scale from % to likelihood gm->rate = strtod(tmp+1, &end); if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s); + gm->rate *= 0.01; } if ( !args->ngenmap ) error("Genetic map empty?\n"); - int i; - for (i=0; ingenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1 if ( hts_close(fp) ) error("Close failed\n"); free(str.s); return 0; @@ -257,7 +414,6 @@ static double get_genmap_rate(args_t *args, int start, int end) // position j to be equal or larger than end int j = i; while ( j+1ngenmap && args->genmap[j].pos < end ) j++; - if ( i==j ) { args->igenmap = i; @@ -274,17 +430,20 @@ static double get_genmap_rate(args_t *args, int start, int end) void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob) { args_t *args = (args_t*) data; - double ci = get_genmap_rate(args, pos - prev_pos, pos); + double ci = get_genmap_rate(args, prev_pos, pos); + if ( args->rec_rate ) ci *= args->rec_rate; + if ( ci > 1 ) ci = 1; MAT(tprob,2,STATE_HW,STATE_AZ) *= ci; MAT(tprob,2,STATE_AZ,STATE_HW) *= ci; MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ); MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW); } -void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob) +void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob) { args_t *args = (args_t*) data; double ci = (pos - prev_pos) * args->rec_rate; + if ( ci > 1 ) ci = 1; MAT(tprob,2,STATE_HW,STATE_AZ) *= ci; MAT(tprob,2,STATE_AZ,STATE_HW) *= ci; MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ); @@ -317,132 +476,163 @@ void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, * */ -static void flush_viterbi(args_t *args) +static void flush_viterbi(args_t *args, int ismpl) { - int i,j; + smpl_t *smpl = &args->smpl[ismpl]; + if ( !smpl->nsites ) return; - if ( !args->nsites ) return; + const char *name = args->hdr->samples[ args->roh_smpl->idx[ismpl] ]; - if ( !args->vi_training ) + int i,j,k; + + if ( !args->vi_training ) // single viterbi pass { - // single viterbi pass, one chromsome - hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites); - hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites); + hmm_restore(args->hmm, smpl->snapshot); + int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites; + if ( end < smpl->nsites ) + smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1); + + args->igenmap = smpl->igenmap; + hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites); + hmm_run_fwd_bwd(args->hmm, smpl->nsites, smpl->eprob, smpl->sites); double *fwd = hmm_get_fwd_bwd_prob(args->hmm); - const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid); - uint8_t *vpath = hmm_get_viterbi_path(args->hmm); - for (i=0; insites; i++) + const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid); + uint8_t *vpath = hmm_get_viterbi_path(args->hmm); + + for (i=0; isites[i]+1, state, phred_score(1.0-pval[state])); - } - return; - } + double qual = phred_score(1.0 - fwd[i*2 + state]); + if ( args->output_type & OUTPUT_ST ) + { + args->str.l = 0; + ksprintf(&args->str, "ST\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[i]+1, state, qual); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); + } - // viterbi training, multiple chromosomes - double t2az_prev, t2hw_prev; - double deltaz, delthw; - int niter = 0; - do - { - double *tprob_arr = hmm_get_tprob(args->hmm); - t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ; - t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW; - double tcounts[] = { 0,0,0,0 }; - for (i=0; inrids; i++) - { - // run viterbi for each chromosomes. eprob and sites contain - // multiple chromosomes, rid_offs mark the boundaries - int ioff = args->rid_offs[i]; - int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff; - hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); - - // what transitions were observed: add to the total counts - uint8_t *vpath = hmm_get_viterbi_path(args->hmm); - for (j=1; joutput_type & OUTPUT_RG ) { - // count the number of transitions - int prev_state = vpath[2*(j-1)]; - int curr_state = vpath[2*j]; - MAT(tcounts,2,curr_state,prev_state) += 1; + if ( state!=smpl->rg.state ) + { + if ( !state ) // the region ends, flush + { + args->str.l = 0; + ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid), + smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); + smpl->rg.state = 0; + } + else + { + smpl->rg.state = 1; + smpl->rg.beg = smpl->sites[i]; + smpl->rg.rid = args->prev_rid; + } + } + else if ( state ) + { + smpl->rg.nqual++; + smpl->rg.qual += qual; + smpl->rg.end = smpl->sites[i]; + } } } - // update the transition matrix - int n = 1; - for (i=0; i<2; i++) + if ( end < smpl->nsites ) { - for (j=0; j<2; j++) n += MAT(tcounts,2,i,j); + end = smpl->nsites - args->nbuf_olap; + memmove(smpl->sites, smpl->sites + end, sizeof(*smpl->sites)*args->nbuf_olap); + memmove(smpl->eprob, smpl->eprob + end*2, sizeof(*smpl->eprob)*args->nbuf_olap*2); + smpl->nsites = args->nbuf_olap; + smpl->igenmap = args->igenmap; } - for (i=0; i<2; i++) + else { - for (j=0; j<2; j++) + smpl->nsites = 0; + smpl->igenmap = 0; + + if ( smpl->rg.state ) { - // no transition to i-th state was observed, set to a small number - if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n; - else MAT(tcounts,2,i,j) /= n; + args->str.l = 0; + ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid), + smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); + smpl->rg.state = 0; } } - // normalize - for (i=0; i<2; i++) + return; + } + + + // viterbi training, multiple chromosomes + double t2az_prev, t2hw_prev; + double deltaz, delthw; + + double *tprob_arr = hmm_get_tprob(args->hmm); + MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; + MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW; + MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ; + MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + hmm_set_tprob(args->hmm, tprob_arr, 10000); + + int niter = 0; + do + { + tprob_arr = hmm_get_tprob(args->hmm); + t2az_prev = MAT(tprob_arr,2,STATE_AZ,STATE_HW); //args->t2AZ; + t2hw_prev = MAT(tprob_arr,2,STATE_HW,STATE_AZ); //args->t2HW; + double tprob_new[] = { 0,0,0,0 }; + for (i=0; inrid; i++) { - double norm = 0; - for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i); - assert( norm!=0 ); - for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm; + int ioff = smpl->rid_off[i]; + int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff; + args->igenmap = 0; + tprob_arr = hmm_run_baum_welch(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff); + for (j=0; j<2; j++) + for (k=0; k<2; k++) MAT(tprob_new,2,j,k) += MAT(tprob_arr,2,j,k); } + for (j=0; j<2; j++) + for (k=0; k<2; k++) MAT(tprob_new,2,j,k) /= smpl->nrid; - if ( args->genmap_fname || args->rec_rate > 0 ) - hmm_set_tprob(args->hmm, tcounts, 0); - else - hmm_set_tprob(args->hmm, tcounts, 10000); + hmm_set_tprob(args->hmm, tprob_new, 10000); - tprob_arr = hmm_get_tprob(args->hmm); - deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev); - delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev); + deltaz = fabs(MAT(tprob_new,2,STATE_AZ,STATE_HW)-t2az_prev); + delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev); niter++; - fprintf(pysam_stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", - niter,deltaz,delthw, - MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), - MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); + args->str.l = 0; + ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", + name,niter,deltaz,delthw, + 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW), + 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ)); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } - while ( deltaz > 0.0 || delthw > 0.0 ); - double *tprob_arr = hmm_get_tprob(args->hmm); - fprintf(pysam_stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter, - MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), - MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); + while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th ); // output the results - for (i=0; inrids; i++) + for (i=0; inrid; i++) { - int ioff = args->rid_offs[i]; - int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff; - hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); - hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); + int ioff = smpl->rid_off[i]; + int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff; + args->igenmap = 0; + hmm_run_viterbi(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff); + hmm_run_fwd_bwd(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff); uint8_t *vpath = hmm_get_viterbi_path(args->hmm); double *fwd = hmm_get_fwd_bwd_prob(args->hmm); - const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]); + const char *chr = bcf_hdr_id2name(args->hdr,smpl->rid[i]); for (j=0; jsites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval); + int state = vpath[j*2]==STATE_AZ ? 1 : 0; + double *pval = fwd + j*2; + args->str.l = 0; + ksprintf(&args->str, "ROH\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[ioff+j]+1, state, phred_score(1.0-pval[state])); + if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } } } -static void push_rid(args_t *args, int rid) -{ - args->nrids++; - args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int)); - args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int)); - args->rids[ args->nrids-1 ] = rid; - args->rid_offs[ args->nrids-1 ] = args->nsites; -} int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { @@ -470,27 +660,52 @@ int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) return 0; } -int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq) +int8_t *get_GT(args_t *args, bcf1_t *line) { - if ( !args->nitmp ) - { - args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp); - if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid? - args->nitmp /= args->nsmpl; - } + int i; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==args->gt_hdr_id ) break; + if ( i==line->n_fmt ) return NULL; // the tag is not present in this record + + bcf_fmt_t *fmt = &line->d.fmt[i]; + if ( fmt->n!=2 ) return NULL; // not diploid + if ( fmt->type!=BCF_BT_INT8 ) error("This is unexpected, GT type is %d\n", fmt->type); + return (int8_t*) fmt->p; +} + +int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq) +{ int i, nalt = 0, nref = 0; - for (i=0; insmpl; i++) + if ( args->af_smpl ) // subset samples for AF estimate { - int32_t *gt = &args->itmp[i*args->nitmp]; + for (i=0; iaf_smpl->n; i++) + { + int ismpl = args->af_smpl->idx[i]; + if ( bcf_gt_is_missing(gt[2*ismpl]) || bcf_gt_is_missing(gt[2*ismpl+1]) ) continue; - if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + if ( bcf_gt_allele(gt[2*ismpl]) ) nalt++; + else nref++; - if ( bcf_gt_allele(gt[0]) ) nalt++; - else nref++; + if ( bcf_gt_allele(gt[2*ismpl+1]) ) nalt++; + else nref++; + } + } + else // all samples used in AF estimate + { + int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr); + while ( gt < end ) + { + if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + + if ( bcf_gt_allele(gt[0]) ) nalt++; + else nref++; + + if ( bcf_gt_allele(gt[1]) ) nalt++; + else nref++; - if ( bcf_gt_allele(gt[1]) ) nalt++; - else nref++; + gt += 2; + } } if ( !nalt && !nref ) return -1; @@ -498,105 +713,249 @@ int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq) return 0; } +int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_freq) +{ + double af = 0; + int i, j, naf = 0; + + int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial); + if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields + + if ( args->af_smpl ) // subset samples for AF estimate + { + #define BRANCH(type_t) \ + { \ + for (i=0; iaf_smpl->n; i++) \ + { \ + int ismpl = args->af_smpl->idx[i]; \ + type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ + if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ + if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ + double prob[3], norm = 0; \ + prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ + prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ + prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ + for (j=0; j<3; j++) norm += prob[j]; \ + for (j=0; j<3; j++) prob[j] /= norm; \ + af += 0.5*prob[1] + prob[2]; \ + naf++; \ + } \ + } + switch (fmt_pl->type) { + case BCF_BT_INT8: BRANCH(int8_t); break; + case BCF_BT_INT16: BRANCH(int16_t); break; + case BCF_BT_INT32: BRANCH(int32_t); break; + default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + } + #undef BRANCH + } + else // all samples used in AF estimate + { + int nsmpl = bcf_hdr_nsamples(args->hdr); + #define BRANCH(type_t) \ + { \ + type_t *p = (type_t*)fmt_pl->p; \ + p -= fmt_pl->n; \ + for (i=0; in; \ + if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ + if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ + double prob[3], norm = 0; \ + prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ + prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ + prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ + for (j=0; j<3; j++) norm += prob[j]; \ + for (j=0; j<3; j++) prob[j] /= norm; \ + af += 0.5*prob[1] + prob[2]; \ + naf++; \ + } \ + } + switch (fmt_pl->type) { + case BCF_BT_INT8: BRANCH(int8_t); break; + case BCF_BT_INT16: BRANCH(int16_t); break; + case BCF_BT_INT32: BRANCH(int32_t); break; + default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + } + #undef BRANCH + } + if ( !naf ) return -1; + + *alt_freq = af / naf; + return 0; +} + +bcf_fmt_t *get_PL(args_t *args, bcf1_t *line) +{ + int i; + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==args->pl_hdr_id ) return &line->d.fmt[i]; + return NULL; +} -int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg) +int process_line(args_t *args, bcf1_t *line, int ial) { - args->nitmp = 0; + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + + double alt_freq; + int8_t *GTs = NULL; + bcf_fmt_t *fmt_pl = NULL; // Set allele frequency - int ret; + int ret = 0, i,j; if ( args->af_tag ) { // Use an INFO tag provided by the user ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs); - if ( ret==1 ) - *alt_freq = args->AFs[0]; + if ( ret>0 ) + alt_freq = args->AFs[ial-1]; if ( ret==-2 ) error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1); } else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args->files->targets, line, alt_freq); + ret = read_AF(args->files->targets, line, &alt_freq); + } + else if ( args->dflt_AF > 0 ) + { + alt_freq = args->dflt_AF; + } + else if ( args->estimate_AF ) + { + // Estimate AF from GTs or PLs of all samples or samples listed in a file + if ( args->af_from_PL ) + { + fmt_pl = get_PL(args, line); + if ( !fmt_pl ) return -1; + ret = estimate_AF_from_PL(args, fmt_pl, ial, &alt_freq); + } + else + { + GTs = get_GT(args, line); + if ( !GTs ) return -1; + ret = estimate_AF_from_GT(args, GTs, &alt_freq); + } } else { - // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF - ret = -1; - if ( !args->estimate_AF ) + // Use AC/AN + int AC = -1, AN = 0; + ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp); + if ( ret==1 ) { - int AC = -1, AN = 0; - ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp); - if ( ret==1 ) - { - AN = args->itmp[0]; - ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp); - if ( ret>0 ) - AC = args->itmp[0]; - } - if ( AN<=0 || AC<0 ) - ret = -1; - else - *alt_freq = (double) AC/AN; + AN = args->itmp[0]; + ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp); + if ( ret>0 ) + AC = args->itmp[0]; } - if ( ret==-1 ) - ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp + if ( AN<=0 || AC<0 ) + ret = -1; + else + alt_freq = (double) AC/AN; } if ( ret<0 ) return ret; - if ( *alt_freq==0.0 ) - { - if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0 - *alt_freq = args->dflt_AF; - } + if ( alt_freq==0.0 ) return -1; - // Set P(D|G) + int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial); if ( args->fake_PLs ) { - if ( !args->nitmp ) - { - args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp); - if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid? - args->nitmp /= args->nsmpl; - } + if ( !GTs ) GTs = get_GT(args, line); + } + else + { + fmt_pl = get_PL(args, line); + if ( !fmt_pl ) return -1; + if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields + } - int32_t *gt = &args->itmp[args->ismpl*args->nitmp]; - if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1; + for (i=0; iroh_smpl->n; i++) + { + int ismpl = args->roh_smpl->idx[i]; - int a = bcf_gt_allele(gt[0]); - int b = bcf_gt_allele(gt[1]); - if ( a!=b ) - { - pdg[0] = pdg[2] = args->unseen_PL; - pdg[1] = 1 - 2*args->unseen_PL; - } - else if ( a==0 ) + // set P(D|G) + double pdg[3]; + if ( args->fake_PLs ) { - pdg[0] = 1 - 2*args->unseen_PL; - pdg[1] = pdg[2] = args->unseen_PL; + int8_t *gt = GTs + 2*ismpl; + if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + + int a = bcf_gt_allele(gt[0]); + int b = bcf_gt_allele(gt[1]); + if ( a!=b ) + { + pdg[0] = pdg[2] = args->unseen_PL; + pdg[1] = 1 - 2*args->unseen_PL; + } + else if ( a==0 ) + { + pdg[0] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL; + pdg[1] = args->unseen_PL; + pdg[2] = args->unseen_PL*args->unseen_PL; + } + else + { + pdg[0] = args->unseen_PL*args->unseen_PL; + pdg[1] = args->unseen_PL; + pdg[2] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL; + } } else { - pdg[0] = pdg[1] = args->unseen_PL; - pdg[2] = 1 - 2*args->unseen_PL; + #define BRANCH(type_t) \ + { \ + type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ + if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ + if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ + pdg[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ + pdg[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ + pdg[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ + } + switch (fmt_pl->type) { + case BCF_BT_INT8: BRANCH(int8_t); break; + case BCF_BT_INT16: BRANCH(int16_t); break; + case BCF_BT_INT32: BRANCH(int32_t); break; + default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + } + #undef BRANCH } - } - else - { - args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp); - if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid? - args->nitmp /= args->nsmpl; - - int32_t *pl = &args->itmp[args->ismpl*args->nitmp]; - pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0; - pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0; - pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0; double sum = pdg[0] + pdg[1] + pdg[2]; - if ( !sum ) return -1; - pdg[0] /= sum; - pdg[1] /= sum; - pdg[2] /= sum; + if ( !sum ) continue; + for (j=0; j<3; j++) pdg[j] /= sum; + if ( args->skip_homref && pdg[0]>0.99 ) continue; + + smpl_t *smpl = &args->smpl[i]; + smpl->nused++; + + if ( smpl->nsites >= smpl->msites ) + { + hts_expand(uint32_t,smpl->nsites+1,smpl->msites,smpl->sites); + smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2); + if ( !smpl->eprob ) error("Error: failed to alloc %d bytes\n", sizeof(*smpl->eprob)*smpl->msites*2); + } + + // Calculate emission probabilities P(D|AZ) and P(D|HW) + double *eprob = &smpl->eprob[2*smpl->nsites]; + eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq; + eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq; + + smpl->sites[smpl->nsites] = line->pos; + smpl->nsites++; + + if ( args->vi_training ) + { + if ( !smpl->nrid || line->rid!=smpl->rid[smpl->nrid-1] ) + { + smpl->nrid++; + smpl->rid = (int*) realloc(smpl->rid,sizeof(*smpl->rid)*smpl->nrid); + smpl->rid[smpl->nrid-1] = line->rid; + smpl->rid_off = (int*) realloc(smpl->rid_off,sizeof(*smpl->rid_off)*smpl->nrid); + smpl->rid_off[smpl->nrid-1] = smpl->nsites - 1; + } + } + else if ( args->nbuf_max && smpl->nsites >= args->nbuf_max ) flush_viterbi(args, i); } return 0; @@ -604,18 +963,35 @@ int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg) static void vcfroh(args_t *args, bcf1_t *line) { + int i; + // Are we done? if ( !line ) { - flush_viterbi(args); + for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i); return; } args->ntot++; - // Skip unwanted lines + // Skip unwanted lines, for simplicity we consider only biallelic sites if ( line->rid == args->skip_rid ) return; if ( line->n_allele==1 ) return; // no ALT allele - if ( line->n_allele!=2 ) return; // only biallelic sites + if ( line->n_allele > 3 ) return; // cannot be bi-allelic, even with <*> + + // This can be raw callable VCF with the symbolic unseen allele <*> + int ial = 0; + for (i=1; in_allele; i++) + if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; } + if ( ial==0 ) // normal VCF, the symbolic allele is not present + { + if ( line->n_allele!=2 ) return; // not biallelic + ial = 1; + } + else + { + if ( line->n_allele!=3 ) return; // not biallelic + ial = ial==1 ? 2 : 1; // <*> can come in any order + } if ( args->snps_only && !bcf_is_snp(line) ) return; // Initialize genetic map @@ -625,21 +1001,15 @@ static void vcfroh(args_t *args, bcf1_t *line) args->prev_rid = line->rid; args->prev_pos = line->pos; skip_rid = load_genmap(args, line); - if ( !skip_rid && args->vi_training ) push_rid(args, line->rid); } // New chromosome? if ( args->prev_rid!=line->rid ) { skip_rid = load_genmap(args, line); - if ( args->vi_training ) - { - if ( !skip_rid ) push_rid(args, line->rid); - } - else + if ( !args->vi_training ) { - flush_viterbi(args); - args->nsites = 0; + for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i); } args->prev_rid = line->rid; args->prev_pos = line->pos; @@ -657,25 +1027,8 @@ static void vcfroh(args_t *args, bcf1_t *line) args->prev_pos = line->pos; - // Ready for the new site - int m = args->msites; - hts_expand(uint32_t,args->nsites+1,args->msites,args->sites); - if ( args->msites!=m ) - args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2); - - // Set likelihoods and alternate allele frequencies - double alt_freq, pdg[3]; - if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong - - args->nused++; - - // Calculate emission probabilities P(D|AZ) and P(D|HW) - double *eprob = &args->eprob[2*args->nsites]; - eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq; - eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq; - - args->sites[args->nsites] = line->pos; - args->nsites++; + // parse the new line + process_line(args, line, ial); } static void usage(args_t *args) @@ -688,21 +1041,32 @@ static void usage(args_t *args) fprintf(pysam_stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); fprintf(pysam_stderr, " --AF-tag use TAG for allele frequency\n"); fprintf(pysam_stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(pysam_stderr, " -e, --estimate-AF calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in \n"); - fprintf(pysam_stderr, " -G, --GTs-only use GTs, ignore PLs, use for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n"); + fprintf(pysam_stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); + fprintf(pysam_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); + fprintf(pysam_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); + fprintf(pysam_stderr, " -e, --estimate-AF [TAG], estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); + fprintf(pysam_stderr, " in . If TAG is not given, the frequency is estimated from GT by default\n"); + fprintf(pysam_stderr, " -G, --GTs-only use GTs and ignore PLs, instead using for PL of the two least likely genotypes.\n"); + fprintf(pysam_stderr, " Safe value to use is 30 to account for GT errors.\n"); + fprintf(pysam_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); fprintf(pysam_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); - fprintf(pysam_stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n"); + fprintf(pysam_stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); + fprintf(pysam_stderr, " is replaced with chromosome name\n"); fprintf(pysam_stderr, " -M, --rec-rate constant recombination rate per bp\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n"); fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --sample sample to analyze\n"); + fprintf(pysam_stderr, " -s, --samples list of samples to analyze [all samples]\n"); + fprintf(pysam_stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " --threads number of extra decompression threads [0]\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "HMM Options:\n"); fprintf(pysam_stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); fprintf(pysam_stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); - fprintf(pysam_stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n"); + fprintf(pysam_stderr, " -V, --viterbi-training estimate HMM parameters, is the convergence threshold, e.g. 1e-10 (experimental)\n"); fprintf(pysam_stderr, "\n"); exit(1); } @@ -723,12 +1087,17 @@ int main_vcfroh(int argc, char *argv[]) {"AF-tag",1,0,0}, {"AF-file",1,0,1}, {"AF-dflt",1,0,2}, + {"buffer-size",1,0,'b'}, + {"ignore-homref",0,0,'i'}, {"estimate-AF",1,0,'e'}, + {"output",1,0,'o'}, + {"output-type",1,0,'O'}, {"GTs-only",1,0,'G'}, - {"sample",1,0,'s'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, {"hw-to-az",1,0,'a'}, {"az-to-hw",1,0,'H'}, - {"viterbi-training",0,0,'V'}, + {"viterbi-training",1,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, @@ -736,12 +1105,13 @@ int main_vcfroh(int argc, char *argv[]) {"genetic-map",1,0,'m'}, {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, + {"threads",1,0,9}, {0,0,0,0} }; int naf_opts = 0; char *tmp; - while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) { switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; @@ -749,7 +1119,15 @@ int main_vcfroh(int argc, char *argv[]) args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; + case 'o': args->output_fname = optarg; break; + case 'O': + if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST; + if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG; + if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ; + break; case 'e': args->estimate_AF = optarg; naf_opts++; break; + case 'b': args->buffer_size = optarg; break; + case 'i': args->skip_homref = 1; break; case 'I': args->snps_only = 1; break; case 'G': args->fake_PLs = 1; @@ -762,7 +1140,8 @@ int main_vcfroh(int argc, char *argv[]) args->rec_rate = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -M %s\n", optarg); break; - case 's': args->sample = strdup(optarg); break; + case 's': args->samples = strdup(optarg); break; + case 'S': args->samples = strdup(optarg); args->samples_is_file = 1; break; case 'a': args->t2AZ = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -a %s\n", optarg); @@ -775,14 +1154,28 @@ int main_vcfroh(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 'V': args->vi_training = 1; break; + case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 'V': + args->vi_training = 1; + args->baum_welch_th = strtod(optarg,&tmp); + if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg); + break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } } + if ( !args->output_fname ) args->output_fname = "pysam_stdout"; + if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG; + char *fname = NULL; + if ( optind==argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + else usage(args); + } + else fname = argv[optind]; - if ( argcvi_training && args->buffer_size ) error("Error: cannot use -b with -V\n"); if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ); if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW); if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n"); @@ -802,7 +1195,9 @@ int main_vcfroh(int argc, char *argv[]) if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) error("Failed to read the targets: %s\n", args->af_fname); } - if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); + if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) + error("Failed to create threads\n"); + if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); while ( bcf_sr_next_line(args->files) ) @@ -810,7 +1205,15 @@ int main_vcfroh(int argc, char *argv[]) vcfroh(args, args->files->readers[0].buffer[0]); } vcfroh(args, NULL); - fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); + int i, nmin = 0; + for (i=0; iroh_smpl->n; i++) + if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused; + fprintf(pysam_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin); + if ( nmin==0 ) + { + fprintf(pysam_stderr,"No usable sites were found."); + if ( !naf_opts && !args->dflt_AF ) fprintf(pysam_stderr, " Consider using one of the AF options.\n"); + } destroy_data(args); free(args); return 0; diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index 1032bf8..4041a5a 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2015 Genome Research Ltd. + Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -39,6 +39,7 @@ THE SOFTWARE. */ #include #include "bcftools.h" #include "filter.h" +#include "bin.h" // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 @@ -67,17 +68,6 @@ typedef struct } idist_t; -typedef struct -{ - double x; - double x2; - double y; - double y2; - double xy; - double n; -} -smpl_r_t; - typedef struct { int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; @@ -108,9 +98,14 @@ stats_t; typedef struct { - uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches - float r2sum; - uint32_t r2n; + uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats + /* + Pearson's R^2 is used for aggregate R^2 + y, yy .. sum of dosage and squared dosage in the query VCF (second file) + x, xx .. sum of squared dosage in the truth VCF (first file) + n .. number of genotypes + */ + double y, yy, x, xx, yx, n; } gtcmp_t; @@ -135,7 +130,11 @@ typedef struct int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm; uint8_t *tmp_frm; int dp_min, dp_max, dp_step; - gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons + gtcmp_t *smpl_gts_snps, *smpl_gts_indels; + gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons + bin_t *af_bins; + float *farr; + int mfarr; // indel context indel_ctx_t *indel_ctx; @@ -148,21 +147,18 @@ typedef struct // other bcf_srs_t *files; bcf_sr_regions_t *exons; - char **argv, *exons_fname, *regions_list, *samples_list, *targets_list; + char **argv, *exons_fname, *regions_list, *samples_list, *targets_list, *af_bins_list, *af_tag; int argc, verbose_sites, first_allele_only, samples_is_file; int split_by_id, nstats; filter_t *filter[2]; char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE - - // Per Sample r working data arrays of size equal to number of samples - smpl_r_t* smpl_r_snps; - smpl_r_t* smpl_r_indels; + int n_threads; } args_t; -static int type2dosage[6], type2ploidy[6], type2stats[6]; +static int type2dosage[6], type2ploidy[6], type2stats[7]; static void idist_init(idist_t *d, int min, int max, int step) { @@ -187,6 +183,12 @@ static inline int idist_i2bin(idist_t *d, int i) return i-1+d->min; } +static inline int clip_nonnegative(float x, int limit) +{ + if (x >= limit || isnan(x)) return limit - 1; + else if (x <= 0.0) return 0; + else return (int) x; +} #define IC_DBG 0 #if IC_DBG @@ -403,13 +405,30 @@ static void init_stats(args_t *args) args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str); if ( args->files->nreaders==2 ) args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str); + args->files->max_unpack |= filter_max_unpack(args->filter[0]); + } + + // AF corresponds to AC but is more robust to mixtures of haploid and diploid GTs + if ( !args->af_bins_list ) + { + args->m_af = 101; + for (i=0; ifiles->nreaders; i++) + if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af ) + args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1; + } + else + { + args->af_bins = bin_init(args->af_bins_list,0,1); + + // m_af is used also for other af arrays, where the first bin is for + // singletons. However, since the last element is unused in af_bins + // (n boundaries form n-1 intervals), the m_af count is good for both. + args->m_af = bin_get_size(args->af_bins); } - // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs - args->m_af = 101; - for (i=0; ifiles->nreaders; i++) - if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af ) - args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1; + bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); + if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) ) + error("No such INFO tag: %s\n", args->af_tag); #if QUAL_STATS args->m_qual = 999; @@ -430,8 +449,6 @@ static void init_stats(args_t *args) args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t)); args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t)); args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t)); - args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t)); - args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t)); } for (i=0; instats; i++) { @@ -503,9 +520,10 @@ static void init_stats(args_t *args) type2stats[GT_HOM_RR] = 0; type2stats[GT_HET_RA] = 1; type2stats[GT_HOM_AA] = 2; - type2stats[GT_HET_AA] = 1; + type2stats[GT_HET_AA] = 3; type2stats[GT_HAPL_R] = 0; type2stats[GT_HAPL_A] = 2; + type2stats[GT_UNKN] = 4; } static void destroy_stats(args_t *args) @@ -526,7 +544,6 @@ static void destroy_stats(args_t *args) if (stats->qual_indels) free(stats->qual_indels); #endif #if HWE_STATS - //if ( args->files->n_smpl ) free(stats->af_hwe); free(stats->af_hwe); #endif free(stats->insertions); @@ -554,6 +571,8 @@ static void destroy_stats(args_t *args) if ( args->exons ) free(stats->smpl_frm_shifts); } for (j=0; jnusr; j++) free(args->usr[j].tag); + if ( args->af_bins ) bin_destroy(args->af_bins); + free(args->farr); free(args->usr); free(args->tmp_frm); free(args->tmp_iaf); @@ -562,8 +581,6 @@ static void destroy_stats(args_t *args) free(args->af_gts_indels); free(args->smpl_gts_snps); free(args->smpl_gts_indels); - free(args->smpl_r_snps); - free(args->smpl_r_indels); if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx); if (args->filter[0]) filter_destroy(args->filter[0]); if (args->filter[1]) filter_destroy(args->filter[1]); @@ -572,36 +589,59 @@ static void destroy_stats(args_t *args) static void init_iaf(args_t *args, bcf_sr_t *reader) { bcf1_t *line = reader->buffer[0]; - if ( args->ntmp_iaf < line->n_allele ) + hts_expand(int32_t,line->n_allele,args->ntmp_iaf,args->tmp_iaf); + + int i, ret; + if ( args->af_tag ) { - args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int)); - args->ntmp_iaf = line->n_allele; + ret = bcf_get_info_float(reader->header, line, args->af_tag, &args->farr, &args->mfarr); + if ( ret<=0 || ret!=line->n_allele-1 ) + { + // the AF tag is not present or wrong number of values, put in the singletons/unknown bin + for (i=0; in_allele; i++) args->tmp_iaf[i] = 0; + return; + } + args->tmp_iaf[0] = 0; + for (i=1; in_allele; i++) + { + float af = args->farr[i-1]; + if ( af<0 ) af = 0; + else if ( af>1 ) af = 1; + int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2); + args->tmp_iaf[i] = iaf + 1; // the first tmp_iaf bin is reserved for singletons + } + return; } + // tmp_iaf is first filled with AC counts in calc_ac and then transformed to // an index to af_gts_snps - int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO); - if ( ret ) + ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO); + if ( !ret ) { - int an=0; - for (i=0; in_allele; i++) - an += args->tmp_iaf[i]; + for (i=0; in_allele; i++) args->tmp_iaf[i] = 0; // singletons/unknown bin + return; + } - args->tmp_iaf[0] = 0; - for (i=1; in_allele; i++) + int an = 0; + for (i=0; in_allele; i++) + an += args->tmp_iaf[i]; + + args->tmp_iaf[0] = 0; + for (i=1; in_allele; i++) + { + if ( args->tmp_iaf[i]==1 ) + args->tmp_iaf[i] = 0; // singletons into the first bin + else if ( !an ) + args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin + else { - if ( args->tmp_iaf[i]==1 ) - args->tmp_iaf[i] = 0; // singletons into the first bin - else if ( !an ) - args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin - else - args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an; + float af = (float) args->tmp_iaf[i] / an; + if ( af<0 ) af = 0; + else if ( af>1 ) af = 1; + int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2); + args->tmp_iaf[i] = iaf + 1; } } - else - for (i=0; in_allele; i++) - args->tmp_iaf[i] = 0; - - // todo: otherwise use AF } static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) @@ -621,7 +661,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) bcf1_t *line = reader->buffer[0]; #if QUAL_STATS - int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual; + int iqual = clip_nonnegative(line->qual, args->m_qual); stats->qual_indels[iqual]++; #endif @@ -756,7 +796,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) if ( ref<0 ) return; #if QUAL_STATS - int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual; + int iqual = clip_nonnegative(line->qual, args->m_qual); stats->qual_snps[iqual]++; #endif @@ -873,6 +913,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int { float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot); int idx = het_frac*(args->naf_hwe - 1); +//check me: what is this? if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1]; stats->af_hwe[idx]++; } @@ -911,88 +952,42 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return; // only the first ALT allele is considered - int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1; + int iaf = args->tmp_iaf[1]; int line_type = bcf_get_variant_types(files->readers[0].buffer[0]); gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels; gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels; - // - // Calculates r squared - // x is mean dosage of x at given site - // x2 is mean squared dosage of x at given site - // y is mean dosage of x at given site - // y2 is mean squared dosage of x at given site - // xy is mean dosage of x*y at given site - // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) ) - // r2n is number of sites considered - // output as r2sum/r2n for each AF bin - int r2n = 0; - float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0; - // Select smpl_r - smpl_r_t *smpl_r = NULL; - if (line_type&VCF_SNP) - { - smpl_r = args->smpl_r_snps; - } - else if (line_type&VCF_INDEL) - { - smpl_r = args->smpl_r_indels; - } for (is=0; isn_smpl; is++) { // Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of // actual alleles can be enforced by running without the -c option. int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL); - if ( gt0 == GT_UNKN ) continue; - int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL); - if ( gt1 == GT_UNKN ) continue; - if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes + int idx0 = type2stats[gt0]; + int idx1 = type2stats[gt1]; + af_stats[iaf].gt2gt[idx0][idx1]++; + smpl_stats[is].gt2gt[idx0][idx1]++; - int dsg0 = type2dosage[gt0]; - int dsg1 = type2dosage[gt1]; - x += dsg0; - x2 += dsg0*dsg0; - y += dsg1; - y2 += dsg1*dsg1; - xy += dsg0*dsg1; - r2n++; - - int idx = type2stats[gt0]; - if ( gt0==gt1 ) - { - af_stats[iaf].m[idx]++; - smpl_stats[is].m[idx]++; - } - else - { - af_stats[iaf].mm[idx]++; - smpl_stats[is].mm[idx]++; - } - - // Now do it across samples + if ( gt0 == GT_UNKN || gt1 == GT_UNKN ) continue; + if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes - if (smpl_r) { - smpl_r[is].xy += dsg0*dsg1; - smpl_r[is].x += dsg0; - smpl_r[is].x2 += dsg0*dsg0; - smpl_r[is].y += dsg1; - smpl_r[is].y2 += dsg1*dsg1; - ++(smpl_r[is].n); - } - } - - if ( r2n ) - { - x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n; - float cov = xy - x*y; - float var2 = (x2 - x*x) * (y2 - y*y); - if ( var2!=0 ) - { - af_stats[iaf].r2sum += cov*cov/var2; - af_stats[iaf].r2n++; - } + float y = type2dosage[gt0]; + float x = type2dosage[gt1]; + + smpl_stats[is].yx += y*x; + smpl_stats[is].x += x; + smpl_stats[is].xx += x*x; + smpl_stats[is].y += y; + smpl_stats[is].yy += y*y; + smpl_stats[is].n += 1; + + af_stats[iaf].yx += y*x; + af_stats[iaf].x += x; + af_stats[iaf].xx += x*x; + af_stats[iaf].y += y; + af_stats[iaf].yy += y*y; + af_stats[iaf].n += 1; } if ( args->verbose_sites ) @@ -1129,7 +1124,7 @@ static void print_header(args_t *args) #define T2S(x) type2stats[x] static void print_stats(args_t *args) { - int i, id; + int i, j,k, id; printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n"); for (id=0; idfiles->nreaders; id++) printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); @@ -1202,6 +1197,24 @@ static void print_stats(args_t *args) stats->af_repeats[1][1] += stats->af_repeats[1][0]; stats->af_repeats[2][1] += stats->af_repeats[2][0]; } + // move the singletons stats into the first AF bin, singleton stats was collected separately because of init_iaf + if ( args->af_gts_snps ) + { + args->af_gts_snps[1].y += args->af_gts_snps[0].y; + args->af_gts_snps[1].yy += args->af_gts_snps[0].yy; + args->af_gts_snps[1].xx += args->af_gts_snps[0].xx; + args->af_gts_snps[1].yx += args->af_gts_snps[0].yx; + args->af_gts_snps[1].n += args->af_gts_snps[0].n; + } + if ( args->af_gts_indels ) + { + args->af_gts_indels[1].y += args->af_gts_indels[0].y; + args->af_gts_indels[1].yy += args->af_gts_indels[0].yy; + args->af_gts_indels[1].xx += args->af_gts_indels[0].xx; + args->af_gts_indels[1].yx += args->af_gts_indels[0].yx; + args->af_gts_indels[1].n += args->af_gts_indels[0].n; + } + printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { @@ -1209,7 +1222,8 @@ static void print_stats(args_t *args) for (i=1; im_af; i++) // note that af[1] now contains also af[0], see SiS stats output above { if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue; - printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], + double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); + printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]); } } @@ -1266,34 +1280,56 @@ static void print_stats(args_t *args) printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl); int x; - for (x=0; x<2; x++) + for (x=0; x<2; x++) // x=0: snps, x=1: indels { gtcmp_t *stats; if ( x==0 ) { - printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_snps; } else { - printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_indels; } - uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0}; + uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins for (i=0; im_af; i++) { - int j, n = 0; - for (j=0; j<3; j++) + int n = 0; + uint64_t m[4] = {0,0,0,0}, mm[4] = {0,0,0,0}; // in i-th AF bin + for (j=0; j<4; j++) // rr, ra, aa hom, aa het, ./. + for (k=0; k<4; k++) + { + n += stats[i].gt2gt[j][k]; + if ( j==k ) + { + nrd_m[j] += stats[i].gt2gt[j][k]; + m[j] += stats[i].gt2gt[j][k]; + } + else + { + nrd_mm[j] += stats[i].gt2gt[j][k]; + mm[j] += stats[i].gt2gt[j][k]; + } + } + if ( !i || !n ) continue; // skip singleton stats and empty bins + + // Pearson's r2 + double r2 = 0; + if ( stats[i].n ) { - n += stats[i].m[j] + stats[i].mm[j]; - nrd_m[j] += stats[i].m[j]; - nrd_mm[j] += stats[i].mm[j]; + r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n); + r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n)); + r2 *= r2; } - if ( !i || !n ) continue; // skip singleton stats and empty bins - printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1)); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); - printf("\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n); + double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); + printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', af); + printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); + printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); + if ( stats[i].n && !isnan(r2) ) printf("\t%f", r2); + else printf("\t"NA_STRING); + printf("\t%.0f\n", stats[i].n); } if ( x==0 ) @@ -1309,8 +1345,8 @@ static void print_stats(args_t *args) } else printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); - uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)]; - uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)]; + uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)]; + uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)]; printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', m+mm ? mm*100.0/(m+mm) : 0, nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0, @@ -1319,42 +1355,99 @@ static void print_stats(args_t *args) ); } - for (x=0; x<2; x++) + for (x=0; x<2; x++) // x=0: snps, x=1: indels { gtcmp_t *stats; - smpl_r_t *smpl_r_array; if ( x==0 ) { printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_snps; - smpl_r_array = args->smpl_r_snps; } else { printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_indels; - smpl_r_array = args->smpl_r_indels; } for (i=0; ifiles->n_smpl; i++) { - uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)]; - uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)]; - // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar - smpl_r_t *smpl_r = smpl_r_array + i; - double r = 0.0; - if (smpl_r->n) { - double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula - double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n; - double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n; - r = (sum_crossprod)/sqrt(x2_xx*y2_yy); + uint64_t mm = 0, m = stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]; + for (j=0; j<3; j++) + for (k=0; k<3; k++) + if ( j!=k ) mm += stats[i].gt2gt[j][k]; + + // Pearson's r2 + double r2 = 0; + if ( stats[i].n ) + { + r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n); + r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n)); + r2 *= r2; } printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); - if (smpl_r->n && !isnan(r)) printf("\t%f\n", r*r); + printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)], + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)], + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]); + printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)], + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)], + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]); + if ( stats[i].n && !isnan(r2) ) printf("\t%f\n", r2); else printf("\t"NA_STRING"\n"); } } + for (x=0; x<2; x++) // x=0: snps, x=1: indels + { + //printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); + + gtcmp_t *stats; + if ( x==0 ) + { + printf("# GCTs, Genotype concordance table (SNPs)\n# GCTs"); + stats = args->smpl_gts_snps; + } + else + { + printf("# GCTi, Genotype concordance table (indels)\n# GCTi"); + stats = args->smpl_gts_indels; + } + i = 1; + printf("\t[%d]sample", ++i); + printf("\t[%d]RR Hom -> RR Hom", ++i); + printf("\t[%d]RR Hom -> RA Het", ++i); + printf("\t[%d]RR Hom -> AA Hom", ++i); + printf("\t[%d]RR Hom -> AA Het", ++i); + printf("\t[%d]RR Hom -> missing", ++i); + printf("\t[%d]RA Het -> RR Hom", ++i); + printf("\t[%d]RA Het -> RA Het", ++i); + printf("\t[%d]RA Het -> AA Hom", ++i); + printf("\t[%d]RA Het -> AA Het", ++i); + printf("\t[%d]RA Het -> missing", ++i); + printf("\t[%d]AA Hom -> RR Hom", ++i); + printf("\t[%d]AA Hom -> RA Het", ++i); + printf("\t[%d]AA Hom -> AA Hom", ++i); + printf("\t[%d]AA Hom -> AA Het", ++i); + printf("\t[%d]AA Hom -> missing", ++i); + printf("\t[%d]AA Het -> RR Hom", ++i); + printf("\t[%d]AA Het -> RA Het", ++i); + printf("\t[%d]AA Het -> AA Hom", ++i); + printf("\t[%d]AA Het -> AA Het", ++i); + printf("\t[%d]AA Het -> missing", ++i); + printf("\t[%d]missing -> RR Hom", ++i); + printf("\t[%d]missing -> RA Het", ++i); + printf("\t[%d]missing -> AA Hom", ++i); + printf("\t[%d]missing -> AA Het", ++i); + printf("\t[%d]missing -> missing\n", ++i); + + for (i=0; ifiles->n_smpl; i++) + { + printf("GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]); + for (j=0; j<5; j++) + for (k=0; k<5; k++) + printf("\t%"PRId64, stats[i].gt2gt[j][k]); + printf("\n"); + } + } } printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); @@ -1423,8 +1516,10 @@ static void print_stats(args_t *args) for (j=0; jnaf_hwe; j++) sum_tot += ptr[j]; if ( !sum_tot ) continue; + double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); + int nprn = 3; - printf("HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot); + printf("HWE\t%d\t%f\t%d",id,af,sum_tot); for (j=0; jnaf_hwe; j++) { sum_tmp += ptr[j]; @@ -1462,6 +1557,8 @@ static void usage(void) fprintf(stderr, "Usage: bcftools stats [options] []\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); + fprintf(stderr, " --af-bins allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); + fprintf(stderr, " --af-tag allele frequency tag to use, by default estimated from AN,AC or GT\n"); fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); fprintf(stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); fprintf(stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); @@ -1478,6 +1575,7 @@ static void usage(void) fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); + fprintf(stderr, " --threads number of extra decompression threads [0]\n"); fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); fprintf(stderr, "\n"); exit(1); @@ -1494,6 +1592,8 @@ int main_vcfstats(int argc, char *argv[]) static struct option loptions[] = { + {"af-bins",1,0,1}, + {"af-tag",1,0,2}, {"1st-allele-only",0,0,'1'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, @@ -1512,10 +1612,13 @@ int main_vcfstats(int argc, char *argv[]) {"targets-file",1,0,'T'}, {"fasta-ref",1,0,'F'}, {"user-tstv",1,0,'u'}, + {"threads",1,0,9}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) { switch (c) { + case 1 : args->af_bins_list = optarg; break; + case 2 : args->af_tag = optarg; break; case 'u': add_user_stats(args,optarg); break; case '1': args->first_allele_only = 1; break; case 'F': args->ref_fname = optarg; break; @@ -1547,6 +1650,7 @@ int main_vcfstats(int argc, char *argv[]) case 'I': args->split_by_id = 1; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); @@ -1571,6 +1675,9 @@ int main_vcfstats(int argc, char *argv[]) error("Failed to read the targets: %s\n", args->targets_list); if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); + if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) + error("Failed to create threads\n"); + while (fname) { if ( !bcf_sr_add_reader(args->files, fname) ) diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 5653760..a5e5a9f 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -2,7 +2,7 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2015 Genome Research Ltd. + Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -41,6 +41,7 @@ THE SOFTWARE. */ #include #include "bcftools.h" #include "filter.h" +#include "bin.h" // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 @@ -69,17 +70,6 @@ typedef struct } idist_t; -typedef struct -{ - double x; - double x2; - double y; - double y2; - double xy; - double n; -} -smpl_r_t; - typedef struct { int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; @@ -110,9 +100,14 @@ stats_t; typedef struct { - uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches - float r2sum; - uint32_t r2n; + uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats + /* + Pearson's R^2 is used for aggregate R^2 + y, yy .. sum of dosage and squared dosage in the query VCF (second file) + x, xx .. sum of squared dosage in the truth VCF (first file) + n .. number of genotypes + */ + double y, yy, x, xx, yx, n; } gtcmp_t; @@ -137,7 +132,11 @@ typedef struct int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm; uint8_t *tmp_frm; int dp_min, dp_max, dp_step; - gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons + gtcmp_t *smpl_gts_snps, *smpl_gts_indels; + gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons + bin_t *af_bins; + float *farr; + int mfarr; // indel context indel_ctx_t *indel_ctx; @@ -150,21 +149,18 @@ typedef struct // other bcf_srs_t *files; bcf_sr_regions_t *exons; - char **argv, *exons_fname, *regions_list, *samples_list, *targets_list; + char **argv, *exons_fname, *regions_list, *samples_list, *targets_list, *af_bins_list, *af_tag; int argc, verbose_sites, first_allele_only, samples_is_file; int split_by_id, nstats; filter_t *filter[2]; char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE - - // Per Sample r working data arrays of size equal to number of samples - smpl_r_t* smpl_r_snps; - smpl_r_t* smpl_r_indels; + int n_threads; } args_t; -static int type2dosage[6], type2ploidy[6], type2stats[6]; +static int type2dosage[6], type2ploidy[6], type2stats[7]; static void idist_init(idist_t *d, int min, int max, int step) { @@ -189,6 +185,12 @@ static inline int idist_i2bin(idist_t *d, int i) return i-1+d->min; } +static inline int clip_nonnegative(float x, int limit) +{ + if (x >= limit || isnan(x)) return limit - 1; + else if (x <= 0.0) return 0; + else return (int) x; +} #define IC_DBG 0 #if IC_DBG @@ -405,13 +407,30 @@ static void init_stats(args_t *args) args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str); if ( args->files->nreaders==2 ) args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str); + args->files->max_unpack |= filter_max_unpack(args->filter[0]); + } + + // AF corresponds to AC but is more robust to mixtures of haploid and diploid GTs + if ( !args->af_bins_list ) + { + args->m_af = 101; + for (i=0; ifiles->nreaders; i++) + if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af ) + args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1; + } + else + { + args->af_bins = bin_init(args->af_bins_list,0,1); + + // m_af is used also for other af arrays, where the first bin is for + // singletons. However, since the last element is unused in af_bins + // (n boundaries form n-1 intervals), the m_af count is good for both. + args->m_af = bin_get_size(args->af_bins); } - // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs - args->m_af = 101; - for (i=0; ifiles->nreaders; i++) - if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af ) - args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1; + bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); + if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) ) + error("No such INFO tag: %s\n", args->af_tag); #if QUAL_STATS args->m_qual = 999; @@ -432,8 +451,6 @@ static void init_stats(args_t *args) args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t)); args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t)); args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t)); - args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t)); - args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t)); } for (i=0; instats; i++) { @@ -505,9 +522,10 @@ static void init_stats(args_t *args) type2stats[GT_HOM_RR] = 0; type2stats[GT_HET_RA] = 1; type2stats[GT_HOM_AA] = 2; - type2stats[GT_HET_AA] = 1; + type2stats[GT_HET_AA] = 3; type2stats[GT_HAPL_R] = 0; type2stats[GT_HAPL_A] = 2; + type2stats[GT_UNKN] = 4; } static void destroy_stats(args_t *args) @@ -528,7 +546,6 @@ static void destroy_stats(args_t *args) if (stats->qual_indels) free(stats->qual_indels); #endif #if HWE_STATS - //if ( args->files->n_smpl ) free(stats->af_hwe); free(stats->af_hwe); #endif free(stats->insertions); @@ -556,6 +573,8 @@ static void destroy_stats(args_t *args) if ( args->exons ) free(stats->smpl_frm_shifts); } for (j=0; jnusr; j++) free(args->usr[j].tag); + if ( args->af_bins ) bin_destroy(args->af_bins); + free(args->farr); free(args->usr); free(args->tmp_frm); free(args->tmp_iaf); @@ -564,8 +583,6 @@ static void destroy_stats(args_t *args) free(args->af_gts_indels); free(args->smpl_gts_snps); free(args->smpl_gts_indels); - free(args->smpl_r_snps); - free(args->smpl_r_indels); if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx); if (args->filter[0]) filter_destroy(args->filter[0]); if (args->filter[1]) filter_destroy(args->filter[1]); @@ -574,36 +591,59 @@ static void destroy_stats(args_t *args) static void init_iaf(args_t *args, bcf_sr_t *reader) { bcf1_t *line = reader->buffer[0]; - if ( args->ntmp_iaf < line->n_allele ) + hts_expand(int32_t,line->n_allele,args->ntmp_iaf,args->tmp_iaf); + + int i, ret; + if ( args->af_tag ) { - args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int)); - args->ntmp_iaf = line->n_allele; + ret = bcf_get_info_float(reader->header, line, args->af_tag, &args->farr, &args->mfarr); + if ( ret<=0 || ret!=line->n_allele-1 ) + { + // the AF tag is not present or wrong number of values, put in the singletons/unknown bin + for (i=0; in_allele; i++) args->tmp_iaf[i] = 0; + return; + } + args->tmp_iaf[0] = 0; + for (i=1; in_allele; i++) + { + float af = args->farr[i-1]; + if ( af<0 ) af = 0; + else if ( af>1 ) af = 1; + int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2); + args->tmp_iaf[i] = iaf + 1; // the first tmp_iaf bin is reserved for singletons + } + return; } + // tmp_iaf is first filled with AC counts in calc_ac and then transformed to // an index to af_gts_snps - int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO); - if ( ret ) + ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO); + if ( !ret ) { - int an=0; - for (i=0; in_allele; i++) - an += args->tmp_iaf[i]; + for (i=0; in_allele; i++) args->tmp_iaf[i] = 0; // singletons/unknown bin + return; + } - args->tmp_iaf[0] = 0; - for (i=1; in_allele; i++) + int an = 0; + for (i=0; in_allele; i++) + an += args->tmp_iaf[i]; + + args->tmp_iaf[0] = 0; + for (i=1; in_allele; i++) + { + if ( args->tmp_iaf[i]==1 ) + args->tmp_iaf[i] = 0; // singletons into the first bin + else if ( !an ) + args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin + else { - if ( args->tmp_iaf[i]==1 ) - args->tmp_iaf[i] = 0; // singletons into the first bin - else if ( !an ) - args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin - else - args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an; + float af = (float) args->tmp_iaf[i] / an; + if ( af<0 ) af = 0; + else if ( af>1 ) af = 1; + int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2); + args->tmp_iaf[i] = iaf + 1; } } - else - for (i=0; in_allele; i++) - args->tmp_iaf[i] = 0; - - // todo: otherwise use AF } static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) @@ -623,7 +663,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) bcf1_t *line = reader->buffer[0]; #if QUAL_STATS - int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual; + int iqual = clip_nonnegative(line->qual, args->m_qual); stats->qual_indels[iqual]++; #endif @@ -758,7 +798,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) if ( ref<0 ) return; #if QUAL_STATS - int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual; + int iqual = clip_nonnegative(line->qual, args->m_qual); stats->qual_snps[iqual]++; #endif @@ -875,6 +915,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int { float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot); int idx = het_frac*(args->naf_hwe - 1); +//check me: what is this? if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1]; stats->af_hwe[idx]++; } @@ -913,88 +954,42 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return; // only the first ALT allele is considered - int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1; + int iaf = args->tmp_iaf[1]; int line_type = bcf_get_variant_types(files->readers[0].buffer[0]); gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels; gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels; - // - // Calculates r squared - // x is mean dosage of x at given site - // x2 is mean squared dosage of x at given site - // y is mean dosage of x at given site - // y2 is mean squared dosage of x at given site - // xy is mean dosage of x*y at given site - // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) ) - // r2n is number of sites considered - // output as r2sum/r2n for each AF bin - int r2n = 0; - float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0; - // Select smpl_r - smpl_r_t *smpl_r = NULL; - if (line_type&VCF_SNP) - { - smpl_r = args->smpl_r_snps; - } - else if (line_type&VCF_INDEL) - { - smpl_r = args->smpl_r_indels; - } for (is=0; isn_smpl; is++) { // Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of // actual alleles can be enforced by running without the -c option. int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL); - if ( gt0 == GT_UNKN ) continue; - int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL); - if ( gt1 == GT_UNKN ) continue; - if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes + int idx0 = type2stats[gt0]; + int idx1 = type2stats[gt1]; + af_stats[iaf].gt2gt[idx0][idx1]++; + smpl_stats[is].gt2gt[idx0][idx1]++; - int dsg0 = type2dosage[gt0]; - int dsg1 = type2dosage[gt1]; - x += dsg0; - x2 += dsg0*dsg0; - y += dsg1; - y2 += dsg1*dsg1; - xy += dsg0*dsg1; - r2n++; - - int idx = type2stats[gt0]; - if ( gt0==gt1 ) - { - af_stats[iaf].m[idx]++; - smpl_stats[is].m[idx]++; - } - else - { - af_stats[iaf].mm[idx]++; - smpl_stats[is].mm[idx]++; - } - - // Now do it across samples + if ( gt0 == GT_UNKN || gt1 == GT_UNKN ) continue; + if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes - if (smpl_r) { - smpl_r[is].xy += dsg0*dsg1; - smpl_r[is].x += dsg0; - smpl_r[is].x2 += dsg0*dsg0; - smpl_r[is].y += dsg1; - smpl_r[is].y2 += dsg1*dsg1; - ++(smpl_r[is].n); - } - } - - if ( r2n ) - { - x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n; - float cov = xy - x*y; - float var2 = (x2 - x*x) * (y2 - y*y); - if ( var2!=0 ) - { - af_stats[iaf].r2sum += cov*cov/var2; - af_stats[iaf].r2n++; - } + float y = type2dosage[gt0]; + float x = type2dosage[gt1]; + + smpl_stats[is].yx += y*x; + smpl_stats[is].x += x; + smpl_stats[is].xx += x*x; + smpl_stats[is].y += y; + smpl_stats[is].yy += y*y; + smpl_stats[is].n += 1; + + af_stats[iaf].yx += y*x; + af_stats[iaf].x += x; + af_stats[iaf].xx += x*x; + af_stats[iaf].y += y; + af_stats[iaf].yy += y*y; + af_stats[iaf].n += 1; } if ( args->verbose_sites ) @@ -1131,7 +1126,7 @@ static void print_header(args_t *args) #define T2S(x) type2stats[x] static void print_stats(args_t *args) { - int i, id; + int i, j,k, id; fprintf(pysam_stdout, "# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n"); for (id=0; idfiles->nreaders; id++) fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); @@ -1204,6 +1199,24 @@ static void print_stats(args_t *args) stats->af_repeats[1][1] += stats->af_repeats[1][0]; stats->af_repeats[2][1] += stats->af_repeats[2][0]; } + // move the singletons stats into the first AF bin, singleton stats was collected separately because of init_iaf + if ( args->af_gts_snps ) + { + args->af_gts_snps[1].y += args->af_gts_snps[0].y; + args->af_gts_snps[1].yy += args->af_gts_snps[0].yy; + args->af_gts_snps[1].xx += args->af_gts_snps[0].xx; + args->af_gts_snps[1].yx += args->af_gts_snps[0].yx; + args->af_gts_snps[1].n += args->af_gts_snps[0].n; + } + if ( args->af_gts_indels ) + { + args->af_gts_indels[1].y += args->af_gts_indels[0].y; + args->af_gts_indels[1].yy += args->af_gts_indels[0].yy; + args->af_gts_indels[1].xx += args->af_gts_indels[0].xx; + args->af_gts_indels[1].yx += args->af_gts_indels[0].yx; + args->af_gts_indels[1].n += args->af_gts_indels[0].n; + } + fprintf(pysam_stdout, "# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { @@ -1211,7 +1224,8 @@ static void print_stats(args_t *args) for (i=1; im_af; i++) // note that af[1] now contains also af[0], see SiS stats output above { if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue; - fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], + double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); + fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]); } } @@ -1268,34 +1282,56 @@ static void print_stats(args_t *args) fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl); int x; - for (x=0; x<2; x++) + for (x=0; x<2; x++) // x=0: snps, x=1: indels { gtcmp_t *stats; if ( x==0 ) { - fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_snps; } else { - fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_indels; } - uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0}; + uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins for (i=0; im_af; i++) { - int j, n = 0; - for (j=0; j<3; j++) + int n = 0; + uint64_t m[4] = {0,0,0,0}, mm[4] = {0,0,0,0}; // in i-th AF bin + for (j=0; j<4; j++) // rr, ra, aa hom, aa het, ./. + for (k=0; k<4; k++) + { + n += stats[i].gt2gt[j][k]; + if ( j==k ) + { + nrd_m[j] += stats[i].gt2gt[j][k]; + m[j] += stats[i].gt2gt[j][k]; + } + else + { + nrd_mm[j] += stats[i].gt2gt[j][k]; + mm[j] += stats[i].gt2gt[j][k]; + } + } + if ( !i || !n ) continue; // skip singleton stats and empty bins + + // Pearson's r2 + double r2 = 0; + if ( stats[i].n ) { - n += stats[i].m[j] + stats[i].mm[j]; - nrd_m[j] += stats[i].m[j]; - nrd_mm[j] += stats[i].mm[j]; + r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n); + r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n)); + r2 *= r2; } - if ( !i || !n ) continue; // skip singleton stats and empty bins - fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1)); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); - fprintf(pysam_stdout, "\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n); + double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); + fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', af); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); + if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f", r2); + else fprintf(pysam_stdout, "\t"NA_STRING); + fprintf(pysam_stdout, "\t%.0f\n", stats[i].n); } if ( x==0 ) @@ -1311,8 +1347,8 @@ static void print_stats(args_t *args) } else fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); - uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)]; - uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)]; + uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)]; + uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)]; fprintf(pysam_stdout, "NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', m+mm ? mm*100.0/(m+mm) : 0, nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0, @@ -1321,42 +1357,99 @@ static void print_stats(args_t *args) ); } - for (x=0; x<2; x++) + for (x=0; x<2; x++) // x=0: snps, x=1: indels { gtcmp_t *stats; - smpl_r_t *smpl_r_array; if ( x==0 ) { fprintf(pysam_stdout, "# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_snps; - smpl_r_array = args->smpl_r_snps; } else { fprintf(pysam_stdout, "# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_indels; - smpl_r_array = args->smpl_r_indels; } for (i=0; ifiles->n_smpl; i++) { - uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)]; - uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)]; - // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar - smpl_r_t *smpl_r = smpl_r_array + i; - double r = 0.0; - if (smpl_r->n) { - double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula - double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n; - double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n; - r = (sum_crossprod)/sqrt(x2_xx*y2_yy); + uint64_t mm = 0, m = stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]; + for (j=0; j<3; j++) + for (k=0; k<3; k++) + if ( j!=k ) mm += stats[i].gt2gt[j][k]; + + // Pearson's r2 + double r2 = 0; + if ( stats[i].n ) + { + r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n); + r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n)); + r2 *= r2; } fprintf(pysam_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); - if (smpl_r->n && !isnan(r)) fprintf(pysam_stdout, "\t%f\n", r*r); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)], + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)], + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)], + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)], + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]); + if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f\n", r2); else fprintf(pysam_stdout, "\t"NA_STRING"\n"); } } + for (x=0; x<2; x++) // x=0: snps, x=1: indels + { + //printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); + + gtcmp_t *stats; + if ( x==0 ) + { + fprintf(pysam_stdout, "# GCTs, Genotype concordance table (SNPs)\n# GCTs"); + stats = args->smpl_gts_snps; + } + else + { + fprintf(pysam_stdout, "# GCTi, Genotype concordance table (indels)\n# GCTi"); + stats = args->smpl_gts_indels; + } + i = 1; + fprintf(pysam_stdout, "\t[%d]sample", ++i); + fprintf(pysam_stdout, "\t[%d]RR Hom -> RR Hom", ++i); + fprintf(pysam_stdout, "\t[%d]RR Hom -> RA Het", ++i); + fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Hom", ++i); + fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Het", ++i); + fprintf(pysam_stdout, "\t[%d]RR Hom -> missing", ++i); + fprintf(pysam_stdout, "\t[%d]RA Het -> RR Hom", ++i); + fprintf(pysam_stdout, "\t[%d]RA Het -> RA Het", ++i); + fprintf(pysam_stdout, "\t[%d]RA Het -> AA Hom", ++i); + fprintf(pysam_stdout, "\t[%d]RA Het -> AA Het", ++i); + fprintf(pysam_stdout, "\t[%d]RA Het -> missing", ++i); + fprintf(pysam_stdout, "\t[%d]AA Hom -> RR Hom", ++i); + fprintf(pysam_stdout, "\t[%d]AA Hom -> RA Het", ++i); + fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Hom", ++i); + fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Het", ++i); + fprintf(pysam_stdout, "\t[%d]AA Hom -> missing", ++i); + fprintf(pysam_stdout, "\t[%d]AA Het -> RR Hom", ++i); + fprintf(pysam_stdout, "\t[%d]AA Het -> RA Het", ++i); + fprintf(pysam_stdout, "\t[%d]AA Het -> AA Hom", ++i); + fprintf(pysam_stdout, "\t[%d]AA Het -> AA Het", ++i); + fprintf(pysam_stdout, "\t[%d]AA Het -> missing", ++i); + fprintf(pysam_stdout, "\t[%d]missing -> RR Hom", ++i); + fprintf(pysam_stdout, "\t[%d]missing -> RA Het", ++i); + fprintf(pysam_stdout, "\t[%d]missing -> AA Hom", ++i); + fprintf(pysam_stdout, "\t[%d]missing -> AA Het", ++i); + fprintf(pysam_stdout, "\t[%d]missing -> missing\n", ++i); + + for (i=0; ifiles->n_smpl; i++) + { + fprintf(pysam_stdout, "GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]); + for (j=0; j<5; j++) + for (k=0; k<5; k++) + fprintf(pysam_stdout, "\t%"PRId64, stats[i].gt2gt[j][k]); + fprintf(pysam_stdout, "\n"); + } + } } fprintf(pysam_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); @@ -1425,8 +1518,10 @@ static void print_stats(args_t *args) for (j=0; jnaf_hwe; j++) sum_tot += ptr[j]; if ( !sum_tot ) continue; + double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); + int nprn = 3; - fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot); + fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,af,sum_tot); for (j=0; jnaf_hwe; j++) { sum_tmp += ptr[j]; @@ -1464,6 +1559,8 @@ static void usage(void) fprintf(pysam_stderr, "Usage: bcftools stats [options] []\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " --af-bins allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); + fprintf(pysam_stderr, " --af-tag allele frequency tag to use, by default estimated from AN,AC or GT\n"); fprintf(pysam_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); fprintf(pysam_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); fprintf(pysam_stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); @@ -1480,6 +1577,7 @@ static void usage(void) fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); fprintf(pysam_stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); + fprintf(pysam_stderr, " --threads number of extra decompression threads [0]\n"); fprintf(pysam_stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); fprintf(pysam_stderr, "\n"); exit(1); @@ -1496,6 +1594,8 @@ int main_vcfstats(int argc, char *argv[]) static struct option loptions[] = { + {"af-bins",1,0,1}, + {"af-tag",1,0,2}, {"1st-allele-only",0,0,'1'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, @@ -1514,10 +1614,13 @@ int main_vcfstats(int argc, char *argv[]) {"targets-file",1,0,'T'}, {"fasta-ref",1,0,'F'}, {"user-tstv",1,0,'u'}, + {"threads",1,0,9}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) { switch (c) { + case 1 : args->af_bins_list = optarg; break; + case 2 : args->af_tag = optarg; break; case 'u': add_user_stats(args,optarg); break; case '1': args->first_allele_only = 1; break; case 'F': args->ref_fname = optarg; break; @@ -1549,6 +1652,7 @@ int main_vcfstats(int argc, char *argv[]) case 'I': args->split_by_id = 1; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); @@ -1573,6 +1677,9 @@ int main_vcfstats(int argc, char *argv[]) error("Failed to read the targets: %s\n", args->targets_list); if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); + if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) + error("Failed to create threads\n"); + while (fname) { if ( !bcf_sr_add_reader(args->files, fname) ) diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index c14075d..645cc8a 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -181,10 +182,12 @@ static void init_data(args_t *args) if (args->include_types) { args->include = 0; for (i = 0; i < n; ++i) { - if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP; - else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL; - else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP; - else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER; + if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1; + else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1; + else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1; + else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); @@ -195,10 +198,12 @@ static void init_data(args_t *args) if (args->exclude_types) { args->exclude = 0; for (i = 0; i < n; ++i) { - if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP; - else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL; - else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP; - else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER; + if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1; + else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1; + else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1; + else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); @@ -220,7 +225,8 @@ static void init_data(args_t *args) else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out, args->n_threads); + if ( args->n_threads > 0) + hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); // headers: hdr=full header, hsub=subset header, hnull=sites only header if (args->sites_only){ @@ -315,8 +321,8 @@ int subset_vcf(args_t *args, bcf1_t *line) if (args->include || args->exclude) { int line_type = bcf_get_variant_types(line); - if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types - if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types + if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types + if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types } if ( args->filter ) @@ -398,7 +404,7 @@ int subset_vcf(args_t *args, bcf1_t *line) } } - if (args->min_ac) + if (args->min_ac!=-1) { if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC @@ -406,7 +412,7 @@ int subset_vcf(args_t *args, bcf1_t *line) else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC } - if (args->max_ac) + if (args->max_ac!=-1) { if (args->max_ac_type == ALLELE_NONREF && args->max_acmax_ac_type == ALLELE_MINOR && args->max_acmax_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC } - if (args->min_af) + if (args->min_af!=-1) { if (an == 0) return 0; // freq not defined, skip site if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF @@ -423,7 +429,7 @@ int subset_vcf(args_t *args, bcf1_t *line) else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF } - if (args->max_af) + if (args->max_af!=-1) { if (an == 0) return 0; // freq not defined, skip site if (args->max_af_type == ALLELE_NONREF && args->max_aftrim_alts) { int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line); - if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1); + if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1); } if (args->phased) { int phased = bcf_all_phased(args->hdr, line); @@ -494,7 +500,7 @@ static void usage(args_t *args) fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); fprintf(stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(stderr, " --threads number of extra (de)compression threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Subset options:\n"); fprintf(stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); @@ -515,7 +521,7 @@ static void usage(args_t *args) fprintf(stderr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); - fprintf(stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n"); + fprintf(stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); fprintf(stderr, "\n"); exit(1); @@ -533,6 +539,7 @@ int main_vcfview(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->min_ac = args->max_ac = args->min_af = args->max_af = -1; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -726,6 +733,7 @@ int main_vcfview(int argc, char *argv[]) error("Failed to read the targets: %s\n", args->targets_list); } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); @@ -734,6 +742,8 @@ int main_vcfview(int argc, char *argv[]) bcf_hdr_write(args->out, out_hdr); else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); + + int ret = 0; if (!args->header_only) { while ( bcf_sr_next_line(args->files) ) @@ -743,10 +753,12 @@ int main_vcfview(int argc, char *argv[]) if ( subset_vcf(args, line) ) bcf_write1(args->out, out_hdr, line); } + ret = args->files->errnum; + if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } hts_close(args->out); destroy_data(args); bcf_sr_destroy(args->files); free(args); - return 0; + return ret; } diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index 53b7c53..a471f37 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -25,6 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -183,10 +184,12 @@ static void init_data(args_t *args) if (args->include_types) { args->include = 0; for (i = 0; i < n; ++i) { - if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP; - else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL; - else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP; - else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER; + if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1; + else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1; + else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1; + else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1; else { fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n"); @@ -197,10 +200,12 @@ static void init_data(args_t *args) if (args->exclude_types) { args->exclude = 0; for (i = 0; i < n; ++i) { - if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP; - else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL; - else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP; - else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER; + if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1; + else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1; + else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1; + else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1; + else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1; else { fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n"); @@ -222,7 +227,8 @@ static void init_data(args_t *args) else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out, args->n_threads); + if ( args->n_threads > 0) + hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); // headers: hdr=full header, hsub=subset header, hnull=sites only header if (args->sites_only){ @@ -317,8 +323,8 @@ int subset_vcf(args_t *args, bcf1_t *line) if (args->include || args->exclude) { int line_type = bcf_get_variant_types(line); - if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types - if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types + if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types + if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types } if ( args->filter ) @@ -400,7 +406,7 @@ int subset_vcf(args_t *args, bcf1_t *line) } } - if (args->min_ac) + if (args->min_ac!=-1) { if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC @@ -408,7 +414,7 @@ int subset_vcf(args_t *args, bcf1_t *line) else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC } - if (args->max_ac) + if (args->max_ac!=-1) { if (args->max_ac_type == ALLELE_NONREF && args->max_acmax_ac_type == ALLELE_MINOR && args->max_acmax_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC } - if (args->min_af) + if (args->min_af!=-1) { if (an == 0) return 0; // freq not defined, skip site if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF @@ -425,7 +431,7 @@ int subset_vcf(args_t *args, bcf1_t *line) else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF } - if (args->max_af) + if (args->max_af!=-1) { if (an == 0) return 0; // freq not defined, skip site if (args->max_af_type == ALLELE_NONREF && args->max_aftrim_alts) { int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line); - if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1); + if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1); } if (args->phased) { int phased = bcf_all_phased(args->hdr, line); @@ -496,7 +502,7 @@ static void usage(args_t *args) fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(pysam_stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); fprintf(pysam_stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, " --threads number of extra (de)compression threads [0]\n"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "Subset options:\n"); fprintf(pysam_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); @@ -517,7 +523,7 @@ static void usage(args_t *args) fprintf(pysam_stderr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); fprintf(pysam_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); - fprintf(pysam_stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n"); + fprintf(pysam_stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(pysam_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); fprintf(pysam_stderr, "\n"); exit(1); @@ -535,6 +541,7 @@ int main_vcfview(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->min_ac = args->max_ac = args->min_af = args->max_af = -1; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -728,6 +735,7 @@ int main_vcfview(int argc, char *argv[]) error("Failed to read the targets: %s\n", args->targets_list); } + if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); @@ -736,6 +744,8 @@ int main_vcfview(int argc, char *argv[]) bcf_hdr_write(args->out, out_hdr); else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); + + int ret = 0; if (!args->header_only) { while ( bcf_sr_next_line(args->files) ) @@ -745,10 +755,12 @@ int main_vcfview(int argc, char *argv[]) if ( subset_vcf(args, line) ) bcf_write1(args->out, out_hdr, line); } + ret = args->files->errnum; + if ( ret ) fprintf(pysam_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } hts_close(args->out); destroy_data(args); bcf_sr_destroy(args->files); free(args); - return 0; + return ret; } diff --git a/bcftools/version.h b/bcftools/version.h index 05929f5..84247e7 100644 --- a/bcftools/version.h +++ b/bcftools/version.h @@ -1 +1 @@ -#define BCFTOOLS_VERSION "1.3.1" +#define BCFTOOLS_VERSION "1.4.1" diff --git a/buildwheels.sh b/buildwheels.sh index a5987f1..ae0d953 100755 --- a/buildwheels.sh +++ b/buildwheels.sh @@ -22,7 +22,7 @@ if ! grep -q docker /proc/1/cgroup; then exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0 fi -yum install -y zlib-devel +yum install -y zlib-devel bzip2-devel xz-devel # Python 2.6 is not supported rm -r /opt/python/cp26* diff --git a/doc/api.rst b/doc/api.rst index 686c60d..8e76686 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -88,11 +88,11 @@ The above code outputs:: Commands available in :term:`csamtools` are available as simple function calls. For example:: - pysam.sort("ex1.bam", "output") + pysam.sort("-o", "output.bam", "ex1.bam") corresponds to the command line:: - samtools sort ex1.bam output + samtools sort -o output.bam ex1.bam Analogous to :class:`~pysam.AlignmentFile`, a :class:`~pysam.TabixFile` allows fast random access to compressed and diff --git a/doc/release.rst b/doc/release.rst index 1d378f3..3874856 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,81 @@ Release notes ============= +Release 0.11.2.2 +================ + +Bugfix release to address two issues: + +* Changes in 0.11.2.1 broke the GTF/GFF3 parser. Corrected and + more tests have been added. +* [#479] Correct VariantRecord edge cases described in issue + + +Release 0.11.2.1 +================ + +Release to fix release tar-ball containing 0.11.1 pre-compiled +C-files. + + +Release 0.11.2 +============== + +This release wraps htslib/samtools/bcfools versions 1.4.1 in response +to a security fix in these libraries. Additionaly the following +issues have been fixed: + +* [#452] add GFF3 support for tabix parsers +* [#461] Multiple fixes related to VariantRecordInfo and handling of INFO/END +* [#447] limit query name to 251 characters (only partially addresses issue) + +VariantFile and related object fixes + +* Restore VariantFile.\_\_dealloc\_\_ +* Correct handling of bcf_str_missing in bcf_array_to_object and + bcf_object_to_array +* Added update() and pop() methods to some dict-like proxy objects +* scalar INFO entries could not be set again after being deleted +* VariantRecordInfo.__delitem__ now allows unset flags to be deleted without + raising a KeyError +* Multiple other fixes for VariantRecordInfo methods +* INFO/END is now accessible only via VariantRecord.stop and + VariantRecord.rlen. Even if present behind the scenes, it is no longer + accessible via VariantRecordInfo. +* Add argument to issue a warning instead of an exception if input appears + to be truncated + +Other features and fixes: + +* Make AlignmentFile \_\_dealloc\_\_ and close more + stringent +* Add argument AlignmentFile to issue a warning instead of an + exception if input appears to be truncated + +Release 0.11.1 +============== + +Bugfix release + +* [#440] add deprecated 'always' option to infer_query_length for backwards compatibility. + +Release 0.11.0 +============== + +This release wraps the latest versions of htslib/samtools/bcftools and +implements a few bugfixes. + +* [#413] Wrap HTSlib/Samtools/BCFtools 1.4 +* [#422] Fix missing pysam.sort.usage() message +* [#411] Fix BGZfile initialization bug +* [#412] Add seek support for BGZFile +* [#395] Make BGZfile iterable +* [#433] Correct getQueryEnd +* [#419] Export SAM enums such as pysam.CMATCH +* [#415] Fix access by tid in AlignmentFile.fetch() +* [#405] Writing SAM now outputs a header by default. +* [#332] split infer_query_length(always) into infer_query_length and infer_read_length + Release 0.10.0 ============== diff --git a/doc/usage.rst b/doc/usage.rst index 936f3bd..6172329 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -123,26 +123,23 @@ Note that the file open mode needs to changed from ``r`` to ``rb``. Using samtools commands within python ===================================== -Commands available in :term:`csamtools` are available -as simple function calls. For example:: +Commands available in :term:`csamtools` are available as simple +function calls. Command line options are provided as arguments. For +example:: - pysam.sort("ex1.bam", "output") + pysam.sort("-o", "output.bam", "ex1.bam") corresponds to the command line:: - samtools sort ex1.bam output + samtools sort -o output.bam ex1.bam -Command line options can be provided as arguments:: - - pysam.sort("-n", "ex1.bam", "output") - -or:: +Or for example:: - pysam.sort("-m", "1000000", "ex1.bam", "output") + pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam") In order to get usage information, try:: - print pysam.sort.usage() + print(pysam.sort.usage()) Argument errors raise a :class:`pysam.SamtoolsError`:: diff --git a/import.py b/import.py index 12d2016..b8eab01 100644 --- a/import.py +++ b/import.py @@ -31,10 +31,22 @@ import hashlib EXCLUDE = { "samtools": ( - "razip.c", "bgzip.c", "main.c", - "calDepth.c", "bam2bed.c", "wgsim.c", - "md5fa.c", "md5sum-lite.c", "maq2sam.c", - "bamcheck.c", "chk_indel.c", "vcf-miniview.c", + "razip.c", + "bgzip.c", + "main.c", + "calDepth.c", + "bam2bed.c", + "wgsim.c", + "bam_tview.c", + "bam_tview.h", + "bam_tview_html.c", + "bam_tview_curses.c", + "md5fa.c", + "md5sum-lite.c", + "maq2sam.c", + "bamcheck.c", + "chk_indel.c", + "vcf-miniview.c", "htslib-1.3", # do not import twice "hfile_irods.c", # requires irods library ), @@ -73,9 +85,10 @@ def _update_pysam_files(cf, destdir): if not filename: continue dest = filename + ".pysam.c" - with open(filename) as infile: + with open(filename, encoding="utf-8") as infile: lines = "".join(infile.readlines()) - with open(dest, "w") as outfile: + + with open(dest, "w", encoding="utf-8") as outfile: outfile.write('#include "pysam.h"\n\n') subname, _ = os.path.splitext(os.path.basename(filename)) if subname in MAIN.get(basename, []): @@ -161,9 +174,9 @@ if len(sys.argv) >= 1: old_file = os.path.join(targetdir, f) if os.path.exists(old_file): md5_old = hashlib.md5( - "".join(open(old_file, "r").readlines())).digest() + "".join(open(old_file, "r", encoding="utf-8").readlines()).encode()).digest() md5_new = hashlib.md5( - "".join(open(src, "r").readlines())).digest() + "".join(open(src, "r", encoding="utf-8").readlines()).encode()).digest() if md5_old != md5_new: raise ValueError( "incompatible files for %s and %s" % diff --git a/pysam/__init__.py b/pysam/__init__.py index ed17e04..c142c6c 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -3,6 +3,8 @@ import sys import sysconfig from pysam.libchtslib import * +from pysam.libcsamtools import * +from pysam.libcbcftools import * from pysam.libcutils import * import pysam.libcutils as libcutils import pysam.libcfaidx as libcfaidx diff --git a/pysam/cbcftools_util.h b/pysam/cbcftools_util.h new file mode 100644 index 0000000..4a9f2e9 --- /dev/null +++ b/pysam/cbcftools_util.h @@ -0,0 +1,6 @@ +#ifndef CBCFTOOLS_UTIL_H +#define CBCFTOOLS_UTIL_H + +int bcftools_main(int argc, char *argv[]); + +#endif diff --git a/pysam/csamtools_util.h b/pysam/csamtools_util.h new file mode 100644 index 0000000..0a03c13 --- /dev/null +++ b/pysam/csamtools_util.h @@ -0,0 +1,6 @@ +#ifndef CSAMTOOLS_UTIL_H +#define CSAMTOOLS_UTIL_H + +int samtools_main(int argc, char *argv[]); + +#endif diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h index f0d582c..c714986 100644 --- a/pysam/htslib_util.h +++ b/pysam/htslib_util.h @@ -92,36 +92,16 @@ static inline int pysam_bam_get_l_aux(bam1_t * b) { static inline char pysam_bam_seqi(uint8_t * s, int i) { return bam_seqi(s,i);} -// Wrapping bit field access in bam1_core_t -// bit fields not supported in cython and due -// to endian-ness it is not clear which part -// of the bit-field is in the higher or lower bytes. -static inline uint16_t pysam_get_bin(bam1_t * b) { - return b->core.bin;} - static inline uint8_t pysam_get_qual(bam1_t * b) { return b->core.qual;} -static inline uint8_t pysam_get_l_qname(bam1_t * b) { - return b->core.l_qname;} - -static inline uint16_t pysam_get_flag(bam1_t * b) { - return b->core.flag;} static inline uint16_t pysam_get_n_cigar(bam1_t * b) { return b->core.n_cigar;} -static inline void pysam_set_bin(bam1_t * b, uint16_t v) { - b->core.bin=v;} - static inline void pysam_set_qual(bam1_t * b, uint8_t v) { b->core.qual=v;} -static inline void pysam_set_l_qname(bam1_t * b, uint8_t v) { - b->core.l_qname=v;} - -static inline void pysam_set_flag(bam1_t * b, uint16_t v) { - b->core.flag=v;} static inline void pysam_set_n_cigar(bam1_t * b, uint16_t v) { b->core.n_cigar=v;} diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd index f1d59d1..8441313 100644 --- a/pysam/libcalignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -19,15 +19,9 @@ cdef extern from "htslib_util.h": int pysam_bam_get_l_aux(bam1_t * b) char pysam_bam_seqi(uint8_t * s, int i) - uint16_t pysam_get_bin(bam1_t * b) uint8_t pysam_get_qual(bam1_t * b) - uint8_t pysam_get_l_qname(bam1_t * b) - uint16_t pysam_get_flag(bam1_t * b) uint16_t pysam_get_n_cigar(bam1_t * b) - void pysam_set_bin(bam1_t * b, uint16_t v) void pysam_set_qual(bam1_t * b, uint8_t v) - void pysam_set_l_qname(bam1_t * b, uint8_t v) - void pysam_set_flag(bam1_t * b, uint16_t v) void pysam_set_n_cigar(bam1_t * b, uint16_t v) void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index c95bb13..73d426a 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -61,7 +61,7 @@ import struct cimport cython from cpython cimport array as c_array from cpython.version cimport PY_MAJOR_VERSION -from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize +from cpython cimport PyBytes_FromStringAndSize from libc.string cimport strchr from cpython cimport array as c_array @@ -281,6 +281,9 @@ cdef inline packTags(tags): len(value)] + list(value)) elif isinstance(value, array.array): + valuetype = value.typecode + if valuetype not in datatype2format: + valuetype = None # binary tags from arrays if valuetype is None: array_typecode = map_typecode_python_to_htslib(ord(value.typecode)) @@ -325,9 +328,41 @@ cdef inline packTags(tags): return "".join(fmts), args -cdef inline int32_t calculateQueryLength(bam1_t * src): +cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src): """return query length computed from CIGAR alignment. + Length ignores hard-clipped bases. + + Return 0 if there is no CIGAR alignment. + """ + + cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) + + if cigar_p == NULL: + return 0 + + cdef uint32_t k, qpos + cdef int op + qpos = 0 + + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + + if op == BAM_CMATCH or \ + op == BAM_CINS or \ + op == BAM_CSOFT_CLIP or \ + op == BAM_CEQUAL or \ + op == BAM_CDIFF: + qpos += cigar_p[k] >> BAM_CIGAR_SHIFT + + return qpos + + +cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src): + """return query length computed from CIGAR alignment. + + Length includes hard-clipped bases. + Return 0 if there is no CIGAR alignment. """ @@ -356,44 +391,45 @@ cdef inline int32_t calculateQueryLength(bam1_t * src): cdef inline int32_t getQueryStart(bam1_t *src) except -1: cdef uint32_t * cigar_p - cdef uint32_t k, op cdef uint32_t start_offset = 0 + cdef uint32_t k, op - if pysam_get_n_cigar(src): - cigar_p = pysam_bam_get_cigar(src); - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - if op == BAM_CHARD_CLIP: - if start_offset != 0 and start_offset != src.core.l_qseq: - PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string') - return -1 - elif op == BAM_CSOFT_CLIP: - start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT - else: - break + cigar_p = pysam_bam_get_cigar(src); + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + if op == BAM_CHARD_CLIP: + if start_offset != 0 and start_offset != src.core.l_qseq: + raise ValueError('Invalid clipping in CIGAR string') + elif op == BAM_CSOFT_CLIP: + start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT + else: + break return start_offset cdef inline int32_t getQueryEnd(bam1_t *src) except -1: - cdef uint32_t * cigar_p - cdef uint32_t k, op + cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) cdef uint32_t end_offset = src.core.l_qseq + cdef uint32_t k, op # if there is no sequence, compute length from cigar string if end_offset == 0: - end_offset = calculateQueryLength(src) - - # walk backwards in cigar string - if pysam_get_n_cigar(src) > 1: - cigar_p = pysam_bam_get_cigar(src); + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + if op == BAM_CMATCH or \ + op == BAM_CINS or \ + op == BAM_CEQUAL or \ + op == BAM_CDIFF or \ + (op == BAM_CSOFT_CLIP and end_offset == 0): + end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT + else: + # walk backwards in cigar string for k from pysam_get_n_cigar(src) > k >= 1: op = cigar_p[k] & BAM_CIGAR_MASK if op == BAM_CHARD_CLIP: - if end_offset != 0 and end_offset != src.core.l_qseq: - PyErr_SetString(ValueError, - 'Invalid clipping in CIGAR string') - return -1 + if end_offset != src.core.l_qseq: + raise ValueError('Invalid clipping in CIGAR string') elif op == BAM_CSOFT_CLIP: end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT else: @@ -748,10 +784,13 @@ cdef class AlignedSegment: if t == o: return 0 + cdef uint8_t *a = &t.core + cdef uint8_t *b = &o.core + retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t)) - if retval: return retval + # cmp(t.l_data, o.l_data) retval = (t.l_data > o.l_data) - (t.l_data < o.l_data) if retval: @@ -819,49 +858,60 @@ cdef class AlignedSegment: property query_name: """the query template name (None if not present)""" def __get__(self): - cdef bam1_t * src - src = self._delegate - if pysam_get_l_qname(src) == 0: + + cdef bam1_t * src = self._delegate + if src.core.l_qname == 0: return None + return charptr_to_str(pysam_bam_get_qname(src)) def __set__(self, qname): + if qname is None or len(qname) == 0: return - if len(qname) >= 255: - raise ValueError("query length out of range {} > 254".format( + # See issue #447 + # (The threshold is 252 chars, but this includes a \0 byte. + if len(qname) > 251: + raise ValueError("query length out of range {} > 251".format( len(qname))) qname = force_bytes(qname) - cdef bam1_t * src - cdef int l - cdef char * p + cdef bam1_t * src = self._delegate + # the qname is \0 terminated + cdef uint8_t l = len(qname) + 1 - src = self._delegate - p = pysam_bam_get_qname(src) + cdef char * p = pysam_bam_get_qname(src) + cdef uint8_t l_extranul = 0 + + if l % 4 != 0: + l_extranul = 4 - l % 4 - # the qname is \0 terminated - l = len(qname) + 1 pysam_bam_update(src, - pysam_get_l_qname(src), - l, + src.core.l_qname, + l + l_extranul, p) - pysam_set_l_qname(src, l) - + src.core.l_extranul = l_extranul + src.core.l_qname = l + l_extranul + # re-acquire pointer to location in memory # as it might have moved p = pysam_bam_get_qname(src) strncpy(p, qname, l) + # x might be > 255 + cdef uint16_t x = 0 + + for x from l <= x < l + l_extranul: + p[x] = '\0' property flag: """properties flag""" def __get__(self): - return pysam_get_flag(self._delegate) + return self._delegate.core.flag def __set__(self, flag): - pysam_set_flag(self._delegate, flag) + self._delegate.core.flag = flag property reference_name: """:term:`reference` name (None if no AlignmentFile is associated)""" @@ -893,19 +943,17 @@ cdef class AlignedSegment: src = self._delegate src.core.pos = pos if pysam_get_n_cigar(src): - pysam_set_bin(src, - hts_reg2bin( - src.core.pos, - bam_endpos(src), - 14, - 5)) + src.core.bin = hts_reg2bin( + src.core.pos, + bam_endpos(src), + 14, + 5) else: - pysam_set_bin(src, - hts_reg2bin( - src.core.pos, - src.core.pos + 1, - 14, - 5)) + src.core.bin = hts_reg2bin( + src.core.pos, + src.core.pos + 1, + 14, + 5) property mapping_quality: """mapping quality""" @@ -1156,9 +1204,9 @@ cdef class AlignedSegment: property bin: """properties bin""" def __get__(self): - return pysam_get_bin(self._delegate) + return self._delegate.core.bin def __set__(self, bin): - pysam_set_bin(self._delegate, bin) + self._delegate.core.bin = bin ########################################################## @@ -1344,14 +1392,17 @@ cdef class AlignedSegment: This the index of the first base in :attr:`seq` that is not soft-clipped. - """ def __get__(self): return getQueryStart(self._delegate) property query_alignment_end: """end index of the aligned query portion of the sequence (0-based, - exclusive)""" + exclusive) + + This the index just past the last base in :attr:`seq` that is not + soft-clipped. + """ def __get__(self): return getQueryEnd(self._delegate) @@ -1408,26 +1459,30 @@ cdef class AlignedSegment: return result - def infer_query_length(self, always=True): - """inferred read length from CIGAR string. + def infer_query_length(self, always=False): + """infer query length from sequence or CIGAR alignment. - If *always* is set to True, the read length - will be always inferred. If set to False, the length - of the read sequence will be returned if it is - available. + This method deduces the query length from the CIGAR alignment + but does not include hard-clipped bases. - Returns None if CIGAR string is not present. - """ + Returns None if CIGAR alignment is not present. - cdef uint32_t * cigar_p - cdef bam1_t * src + If *always* is set to True, `infer_read_length` is used instead. + This is deprecated and only present for backward compatibility. + """ + if always is True: + return self.infer_read_length() + return calculateQueryLengthWithoutHardClipping(self._delegate) - src = self._delegate + def infer_read_length(self): + """infer read length from CIGAR alignment. - if not always and src.core.l_qseq: - return src.core.l_qseq + This method deduces the read length from the CIGAR alignment + including hard-clipped bases. - return calculateQueryLength(src) + Returns None if CIGAR alignment is not present. + """ + return calculateQueryLengthWithHardClipping(self._delegate) def get_reference_sequence(self): """return the reference sequence. @@ -1677,7 +1732,9 @@ cdef class AlignedSegment: +-----+--------------+-----+ |X |BAM_CDIFF |8 | +-----+--------------+-----+ - |NM |NM tag |9 | + |B |BAM_CBACK |9 | + +-----+--------------+-----+ + |NM |NM tag |10 | +-----+--------------+-----+ If no cigar string is present, empty arrays will be returned. @@ -1756,6 +1813,8 @@ cdef class AlignedSegment: +-----+--------------+-----+ |X |BAM_CDIFF |8 | +-----+--------------+-----+ + |B |BAM_CBACK |9 | + +-----+--------------+-----+ .. note:: The output is a list of (operation, length) tuples, such as @@ -1823,12 +1882,11 @@ cdef class AlignedSegment: k += 1 ## setting the cigar string requires updating the bin - pysam_set_bin(src, - hts_reg2bin( - src.core.pos, - bam_endpos(src), - 14, - 5)) + src.core.bin = hts_reg2bin( + src.core.pos, + bam_endpos(src), + 14, + 5) cpdef set_tag(self, @@ -2477,7 +2535,71 @@ cdef class PileupRead: def __get__(self): return self._is_refskip + +cpdef enum CIGAR_OPS: + CMATCH = 0 + CINS = 1 + CDEL = 2 + CREF_SKIP = 3 + CSOFT_CLIP = 4 + CHARD_CLIP = 5 + CPAD = 6 + CEQUAL = 7 + CDIFF = 8 + CBACK = 9 + + +cpdef enum SAM_FLAGS: + # the read is paired in sequencing, no matter whether it is mapped in a pair + FPAIRED = 1 + # the read is mapped in a proper pair + FPROPER_PAIR = 2 + # the read itself is unmapped; conflictive with FPROPER_PAIR + FUNMAP = 4 + # the mate is unmapped + FMUNMAP = 8 + # the read is mapped to the reverse strand + FREVERSE = 16 + # the mate is mapped to the reverse strand + FMREVERSE = 32 + # this is read1 + FREAD1 = 64 + # this is read2 + FREAD2 = 128 + # not primary alignment + FSECONDARY = 256 + # QC failure + FQCFAIL = 512 + # optical or PCR duplicate + FDUP = 1024 + # supplementary alignment + FSUPPLEMENTARY = 2048 + + __all__ = [ "AlignedSegment", "PileupColumn", - "PileupRead"] + "PileupRead", + "CMATCH", + "CINS", + "CDEL", + "CREF_SKIP", + "CSOFT_CLIP", + "CHARD_CLIP", + "CPAD", + "CEQUAL", + "CDIFF", + "CBACK", + "FPAIRED", + "FPROPER_PAIR", + "FUNMAP", + "FMUNMAP", + "FREVERSE", + "FMREVERSE", + "FREAD1", + "FREAD2", + "FSECONDARY", + "FQCFAIL", + "FDUP", + "FSUPPLEMENTARY"] + diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 2161f87..0b248c1 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -7,16 +7,16 @@ # The principal classes defined in this module are: # # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files -# +# # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping # the original sort order intact -# +# # Additionally this module defines numerous additional classes that # are part of the internal API. These are: -# +# # Various iterator classes to iterate over alignments in sequential # (IteratorRow) or in a stacked fashion (IteratorColumn): -# +# # class IteratorRow # class IteratorRowRegion # class IteratorRowHead @@ -76,15 +76,9 @@ else: cimport cython ######################################################## -## Constants and global variables - -# defines imported from samtools -DEF SEEK_SET = 0 -DEF SEEK_CUR = 1 -DEF SEEK_END = 2 - +## global variables # maximum genomic coordinace -cdef int MAX_POS = 2 << 29 +cdef int MAX_POS = 2 << 29 # valid types for SAM headers VALID_HEADER_TYPES = {"HD" : dict, @@ -98,7 +92,7 @@ VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO") # default type conversions within SAM header records KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str}, - "SQ" : {"SN" : str, "LN" : int, "AS" : str, + "SQ" : {"SN" : str, "LN" : int, "AS" : str, "M5" : str, "SP" : str, "UR" : str, "AH" : str,}, "RG" : {"ID" : str, "CN" : str, "DS" : str, @@ -106,7 +100,7 @@ KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str}, "LB" : str, "PG" : str, "PI" : str, "PL" : str, "PM" : str, "PU" : str, "SM" : str,}, - "PG" : {"ID" : str, "PN" : str, "CL" : str, + "PG" : {"ID" : str, "PN" : str, "CL" : str, "PP" : str, "DS" : str, "VN" : str,},} # output order of fields within records. Ensure that CL is at @@ -147,20 +141,15 @@ def build_header_line(fields, record): return "\t".join(line) -cdef bam_hdr_t * build_header(new_header): +cdef bam_hdr_t * build_header_from_dict(new_header): '''return a new header built from a dictionary in `new_header`. This method inserts the text field, target_name and target_len. ''' - - lines = [] - - # check if hash exists + cdef list lines = [] # create new header and copy old data - cdef bam_hdr_t * dest - - dest = bam_hdr_init() + cdef bam_hdr_t * dest = bam_hdr_init() # first: defined tags for record in VALID_HEADERS: @@ -219,13 +208,63 @@ cdef bam_hdr_t * build_header(new_header): return dest +cdef bam_hdr_t * build_header_from_list(reference_names, + reference_lengths, + add_sq_text=True, + text=None): + + assert len(reference_names) == len(reference_lengths), \ + "unequal names and lengths of reference sequences" + + cdef bam_hdr_t * dest = bam_hdr_init() + + # allocate and fill header + reference_names = [force_bytes(ref) for ref in reference_names] + dest.n_targets = len(reference_names) + n = 0 + for x in reference_names: + n += len(x) + 1 + dest.target_name = calloc(n, sizeof(char*)) + dest.target_len = calloc(n, sizeof(uint32_t)) + for x from 0 <= x < dest.n_targets: + dest.target_len[x] = reference_lengths[x] + name = reference_names[x] + dest.target_name[x] = calloc( + len(name) + 1, sizeof(char)) + strncpy(dest.target_name[x], name, len(name)) + + # Optionally, if there is no text, add a SAM + # compatible header to output file. + if text is None and add_sq_text: + text = [] + for x from 0 <= x < dest.n_targets: + text.append("@SQ\tSN:%s\tLN:%s\n" % \ + (force_str(reference_names[x]), + reference_lengths[x])) + text = ''.join(text) + + cdef char * ctext = NULL + + if text is not None: + # copy without \0 + text = force_bytes(text) + ctext = text + dest.l_text = strlen(ctext) + dest.text = calloc( + strlen(ctext), sizeof(char)) + memcpy(dest.text, ctext, strlen(ctext)) + + return dest + + cdef class AlignmentFile(HTSFile): """AlignmentFile(filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=NULL, header=None, add_sq_text=False, check_header=True, check_sq=True, - reference_filename=None, filename=None, duplicate_filehandle=True) + reference_filename=None, filename=None, duplicate_filehandle=True, + ignore_truncation=False) - A :term:`SAM`/:term:`BAM` formatted file. + A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. If `filepath_or_object` is a string, the file is automatically opened. If `filepath_or_object` is a python File object, the @@ -245,7 +284,7 @@ cdef class AlignmentFile(HTSFile): :class:`~pysam.AlignmentFile`). 2. If `header` is given, the header is built from a - multi-level dictionary. + multi-level dictionary. 3. If `text` is given, new header text is copied from raw text. @@ -297,20 +336,27 @@ cdef class AlignmentFile(HTSFile): when writing, use the string provided as the header reference_names : list - see referece_lengths + see reference_lengths reference_lengths : list - when writing, build header from list of chromosome names and - lengths. By default, 'SQ' and 'LN' tags will be added to the - header text. This option can be changed by unsetting the flag - `add_sq_text`. + when writing or opening a SAM file without header build header + from list of chromosome names and lengths. By default, 'SQ' + and 'LN' tags will be added to the header text. This option + can be changed by unsetting the flag `add_sq_text`. add_sq_text : bool do not add 'SQ' and 'LN' tags to header. This option permits construction :term:`SAM` formatted files without a header. + add_sam_header : bool + when outputting SAM the default is to output a header. This is + equivalent to opening the file in 'wh' mode. If this option is + set to False, no header will be output. To read such a file, + set `check_header=False`. + check_header : bool - when reading, check if header is present (default=True) + obsolete: when reading a SAM file, check if header is present + (default=True) check_sq : bool when reading, check if SQ entries are present in header @@ -326,7 +372,7 @@ cdef class AlignmentFile(HTSFile): Alternative to filepath_or_object. Filename of the file to be opened. - duplicate_filehandle: bool + duplicate_filehandle: bool By default, file handles passed either directly or through File-like objects will be duplicated before passing them to htslib. The duplication prevents issues where the same stream @@ -334,6 +380,10 @@ cdef class AlignmentFile(HTSFile): high-level python object. Set to False to turn off duplication. + ignore_truncation: bool + Issue a warning, instead of raising an error if the current file + appears to be truncated due to a missing EOF marker. Only applies + to bgzipped formats. (Default=False) """ def __cinit__(self, *args, **kwargs): @@ -393,16 +443,19 @@ cdef class AlignmentFile(HTSFile): header=None, port=None, add_sq_text=True, + add_sam_header=True, check_header=True, check_sq=True, filepath_index=None, referencenames=None, referencelengths=None, - duplicate_filehandle=True): + duplicate_filehandle=True, + ignore_truncation=False): '''open a sam, bam or cram formatted file. If _open is called on an existing file, the current file will be closed and a new file will be opened. + ''' cdef char *cfilename = NULL cdef char *creference_filename = NULL @@ -423,6 +476,9 @@ cdef class AlignmentFile(HTSFile): if mode is None: mode = "r" + if add_sam_header and mode == "w": + mode = "wh" + assert mode in ("r", "w", "rb", "wb", "wh", "wbu", "rU", "wb0", "rc", "wc"), \ @@ -468,10 +524,6 @@ cdef class AlignmentFile(HTSFile): self.reference_filename = reference_filename = encode_filename( reference_filename) - cdef char * ctext - cdef hFILE * fp - ctext = NULL - if mode[0] == 'w': # open file for writing @@ -479,50 +531,18 @@ cdef class AlignmentFile(HTSFile): if template: self.header = bam_hdr_dup(template.header) elif header: - self.header = build_header(header) + self.header = build_header_from_dict(header) else: - # build header from a target names and lengths assert reference_names and reference_lengths, \ ("either supply options `template`, `header` " "or both `reference_names` and `reference_lengths` " "for writing") - assert len(reference_names) == len(reference_lengths), \ - "unequal names and lengths of reference sequences" - - # allocate and fill header - reference_names = [force_bytes(ref) for ref in reference_names] - self.header = bam_hdr_init() - self.header.n_targets = len(reference_names) - n = 0 - for x in reference_names: - n += len(x) + 1 - self.header.target_name = calloc(n, sizeof(char*)) - self.header.target_len = calloc(n, sizeof(uint32_t)) - for x from 0 <= x < self.header.n_targets: - self.header.target_len[x] = reference_lengths[x] - name = reference_names[x] - self.header.target_name[x] = calloc( - len(name) + 1, sizeof(char)) - strncpy(self.header.target_name[x], name, len(name)) - - # Optionally, if there is no text, add a SAM - # compatible header to output file. - if text is None and add_sq_text: - text = [] - for x from 0 <= x < self.header.n_targets: - text.append("@SQ\tSN:%s\tLN:%s\n" % \ - (force_str(reference_names[x]), - reference_lengths[x])) - text = ''.join(text) - - if text is not None: - # copy without \0 - text = force_bytes(text) - ctext = text - self.header.l_text = strlen(ctext) - self.header.text = calloc( - strlen(ctext), sizeof(char)) - memcpy(self.header.text, ctext, strlen(ctext)) + # build header from a target names and lengths + self.header = build_header_from_list( + reference_names, + reference_lengths, + add_sq_text=add_sq_text, + text=text) self.htsfile = self._open_htsfile() @@ -542,7 +562,7 @@ cdef class AlignmentFile(HTSFile): # open file for reading if not self._exists(): raise IOError("file `%s` not found" % self.filename) - + self.htsfile = self._open_htsfile() if self.htsfile == NULL: @@ -553,6 +573,8 @@ cdef class AlignmentFile(HTSFile): if self.htsfile.format.category != sequence_data: raise ValueError("file does not contain alignment data") + self.check_truncation(ignore_truncation) + # bam files require a valid header if self.is_bam or self.is_cram: with nogil: @@ -562,16 +584,21 @@ cdef class AlignmentFile(HTSFile): "file does not have valid header (mode='%s') " "- is it BAM format?" % mode ) else: - # in sam files it is optional (htsfile full of - # unmapped reads) - if check_header: + # in sam files a header is optional, but requires + # reference names and lengths + if reference_names and reference_lengths: + self.header = build_header_from_list( + reference_names, + reference_lengths, + add_sq_text=add_sq_text, + text=text) + else: with nogil: self.header = sam_hdr_read(self.htsfile) if self.header == NULL: raise ValueError( - "file does not have valid header (mode='%s') " - "- is it SAM format?" % mode ) - # self.header.ignore_sam_err = True + "file does not have valid header (mode='%s'), " + "please provide reference_names and reference_lengths") # set filename with reference sequences if self.is_cram and reference_filename: @@ -669,7 +696,7 @@ cdef class AlignmentFile(HTSFile): if not self.is_open: raise ValueError("I/O operation on closed file") if not 0 <= tid < self.header.n_targets: - raise ValueError("reference_id %i out of range 0<=tid<%i" % + raise ValueError("reference_id %i out of range 0<=tid<%i" % (tid, self.header.n_targets)) return charptr_to_str(self.header.target_name[tid]) @@ -686,7 +713,7 @@ cdef class AlignmentFile(HTSFile): Alternatively, a samtools :term:`region` string can be supplied. - + If any of the coordinates are missing they will be replaced by the minimum (`start`) or maximum (`end`) coordinate. @@ -695,14 +722,14 @@ cdef class AlignmentFile(HTSFile): Returns ------- - + tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The flag indicates whether no coordinates were supplied and the genomic region is the complete genomic space. Raises ------ - + ValueError for invalid or out of bounds regions. @@ -711,6 +738,9 @@ cdef class AlignmentFile(HTSFile): cdef long long rstart cdef long long rend + if reference is None and tid is None and region is None: + return 0, 0, 0, 0 + rtid = -1 rstart = 0 rend = MAX_POS @@ -735,11 +765,11 @@ cdef class AlignmentFile(HTSFile): if len(parts) >= 3: rend = int(parts[2]) - if not reference: - return 0, 0, 0, 0 - if tid is not None: rtid = tid + if rtid < 0 or rtid >= self.header.n_targets: + raise IndexError("invalid reference, {} out of range 0-{}".format( + rtid, self.header.n_targets)) else: rtid = self.gettid(reference) @@ -764,7 +794,7 @@ cdef class AlignmentFile(HTSFile): tid=None, until_eof=False, multiple_iterators=False): - """fetch reads aligned in a :term:`region`. + """fetch reads aligned in a :term:`region`. See :meth:`AlignmentFile.parse_region` for more information on genomic regions. @@ -789,7 +819,7 @@ cdef class AlignmentFile(HTSFile): Parameters ---------- - + until_eof : bool If `until_eof` is True, all reads from the current file @@ -797,7 +827,7 @@ cdef class AlignmentFile(HTSFile): file. Using this option will also fetch unmapped reads. multiple_iterators : bool - + If `multiple_iterators` is True, multiple iterators on the same file can be used at the same time. The iterator returned will receive its own copy of a filehandle to @@ -841,7 +871,7 @@ cdef class AlignmentFile(HTSFile): if has_coord: return IteratorRowRegion( - self, rtid, rstart, rend, + self, rtid, rstart, rend, multiple_iterators=multiple_iterators) else: if until_eof: @@ -857,22 +887,17 @@ cdef class AlignmentFile(HTSFile): else: if has_coord: raise ValueError( - "fetching by region is not available for sam files") + "fetching by region is not available for SAM files") - if self.header == NULL: + if multiple_iterators == True: raise ValueError( - "fetch called for htsfile without header") + "multiple iterators not implemented for SAM files") - # check if targets are defined - # give warning, sam_read1 segfaults - if self.header.n_targets == 0: - warnings.warn("fetch called for htsfile without header") - return IteratorRowAll(self, multiple_iterators=multiple_iterators) def head(self, n, multiple_iterators=True): - '''return an iterator over the first n alignments. + '''return an iterator over the first n alignments. This iterator is is useful for inspecting the bam-file. @@ -880,15 +905,15 @@ cdef class AlignmentFile(HTSFile): ---------- multiple_iterators : bool - + is set to True by default in order to avoid changing the current file position. - + Returns ------- - + an iterator over a collection of reads - + ''' return IteratorRowHead(self, n, multiple_iterators=multiple_iterators) @@ -903,14 +928,14 @@ cdef class AlignmentFile(HTSFile): not re-opened the file. .. note:: - + This method is too slow for high-throughput processing. If a read needs to be processed with its mate, work from a read name sorted file or, better, cache reads. Returns ------- - + :class:`~pysam.AlignedSegment` : the mate Raises @@ -1061,7 +1086,7 @@ cdef class AlignmentFile(HTSFile): Parameters ---------- - + reference : string reference_name of the genomic region (chromosome) @@ -1070,12 +1095,12 @@ cdef class AlignmentFile(HTSFile): end : int end of the genomic region - + region : string a region string in samtools format. until_eof : bool - count until the end of the file, possibly including + count until the end of the file, possibly including unmapped reads as well. read_callback: string or function @@ -1135,7 +1160,7 @@ cdef class AlignmentFile(HTSFile): return counter @cython.boundscheck(False) # we do manual bounds checking - def count_coverage(self, + def count_coverage(self, reference=None, start=None, end=None, @@ -1150,7 +1175,7 @@ cdef class AlignmentFile(HTSFile): Parameters ---------- - + reference : string reference_name of the genomic region (chromosome) @@ -1165,7 +1190,7 @@ cdef class AlignmentFile(HTSFile): quality_threshold : int quality_threshold is the minimum quality score (in phred) a - base has to reach to be counted. + base has to reach to be counted. read_callback: string or function @@ -1196,7 +1221,7 @@ cdef class AlignmentFile(HTSFile): four array.arrays of the same length in order A C G T : tuple """ - + cdef int _start = start cdef int _stop = end cdef int length = _stop - _start @@ -1221,7 +1246,7 @@ cdef class AlignmentFile(HTSFile): filter_method = 1 elif read_callback == "nofilter": filter_method = 2 - + cdef int _threshold = quality_threshold for read in self.fetch(reference=reference, start=start, @@ -1283,16 +1308,22 @@ cdef class AlignmentFile(HTSFile): return res def close(self): - ''' - closes the :class:`pysam.AlignmentFile`.''' + '''closes the :class:`pysam.AlignmentFile`.''' if self.htsfile == NULL: return cdef int ret = hts_close(self.htsfile) - hts_idx_destroy(self.index) self.htsfile = NULL + if self.index != NULL: + hts_idx_destroy(self.index) + self.index = NULL + + if self.header != NULL: + bam_hdr_destroy(self.header) + self.header = NULL + if ret < 0: global errno if errno == EPIPE: @@ -1301,28 +1332,23 @@ cdef class AlignmentFile(HTSFile): raise OSError(errno, force_str(strerror(errno))) def __dealloc__(self): - # remember: dealloc cannot call other methods - # note: no doc string - # note: __del__ is not called. - - # FIXME[kbj]: isn't self.close a method? I've been duplicating - # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty - # solution and perhaps unnecessary given that calling self.close has - # been working for years. - # AH: I have removed the call to close. Even though it is working, - # it seems to be dangerous according to the documentation as the - # object be partially deconstructed already. cdef int ret = 0 if self.htsfile != NULL: ret = hts_close(self.htsfile) - hts_idx_destroy(self.index); self.htsfile = NULL - bam_destroy1(self.b) + if self.index != NULL: + hts_idx_destroy(self.index) + self.index = NULL + if self.header != NULL: bam_hdr_destroy(self.header) + self.header = NULL + if self.b: + bam_destroy1(self.b) + self.b = NULL if ret < 0: global errno @@ -1330,7 +1356,7 @@ cdef class AlignmentFile(HTSFile): errno = 0 else: raise OSError(errno, force_str(strerror(errno))) - + cpdef int write(self, AlignedSegment read) except -1: ''' write a single :class:`pysam.AlignedSegment` to disk. @@ -1342,7 +1368,7 @@ cdef class AlignmentFile(HTSFile): Returns ------- - + int : the number of bytes written. If the file is closed, this will be 0. ''' @@ -1387,7 +1413,7 @@ cdef class AlignmentFile(HTSFile): return self.header.n_targets property references: - """tuple with the names of :term:`reference` sequences. This is a + """tuple with the names of :term:`reference` sequences. This is a read-only attribute""" def __get__(self): if not self.is_open: raise ValueError( "I/O operation on closed file" ) @@ -1455,10 +1481,10 @@ cdef class AlignmentFile(HTSFile): property text: '''string with the full contents of the :term:`sam file` header as a - string. + string. This is a read-only attribute. - + See :attr:`pysam.AlignmentFile.header` to get a parsed representation of the header. ''' @@ -1468,13 +1494,13 @@ cdef class AlignmentFile(HTSFile): return from_string_and_size(self.header.text, self.header.l_text) property header: - """two-level dictionay with header information from the file. - + """two-level dictionay with header information from the file. + This is a read-only attribute. The first level contains the record (``HD``, ``SQ``, etc) and the second level contains the fields (``VN``, ``LN``, etc). - + The parser is validating and will raise an AssertionError if if encounters any record or field tags that are not part of the SAM specification. Use the @@ -1494,7 +1520,7 @@ cdef class AlignmentFile(HTSFile): raise ValueError( "I/O operation on closed file" ) result = {} - + if self.header.text != NULL: # convert to python string (note: call self.text to # create 0-terminated string) @@ -1518,7 +1544,7 @@ cdef class AlignmentFile(HTSFile): x = {} for idx, field in enumerate(fields[1:]): - if ":" not in field: + if ":" not in field: raise ValueError("malformatted header: no ':' in field" ) key, value = field.split(":", 1) if key in ("CL",): @@ -1576,7 +1602,7 @@ cdef class AlignmentFile(HTSFile): "can not iterate over samfile without header") return self - cdef bam1_t * getCurrent( self ): + cdef bam1_t * getCurrent(self): return self.b cdef int cnext(self): @@ -1598,12 +1624,12 @@ cdef class AlignmentFile(HTSFile): raise IOError('truncated file') else: raise StopIteration - + # Compatibility functions for pysam < 0.8.3 def gettid(self, reference): """deprecated, use get_tid() instead""" return self.get_tid(reference) - + def getrname(self, tid): """deprecated, use get_reference_name() instead""" return self.get_reference_name(tid) @@ -1637,7 +1663,7 @@ cdef class IteratorRow: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): cdef char *cfilename cdef char *creference_filename - + if not samfile.is_open: raise ValueError("I/O operation on closed file") @@ -1711,7 +1737,7 @@ cdef class IteratorRowRegion(IteratorRow): tid, beg, end) - + def __iter__(self): return self @@ -1766,7 +1792,7 @@ cdef class IteratorRowHead(IteratorRow): def __iter__(self): return self - cdef bam1_t * getCurrent( self ): + cdef bam1_t * getCurrent(self): return self.b cdef int cnext(self): @@ -1814,7 +1840,7 @@ cdef class IteratorRowAll(IteratorRow): def __iter__(self): return self - cdef bam1_t * getCurrent( self ): + cdef bam1_t * getCurrent(self): return self.b cdef int cnext(self): @@ -1988,7 +2014,7 @@ cdef int __advance_snpcalls(void * data, bam1_t * b): the samtools pileup. ''' - # Note that this method requries acces to some + # Note that this method requries acces to some # functions in the samtools code base and is thus # not htslib only. # The functions accessed in samtools are: @@ -2029,11 +2055,13 @@ cdef int __advance_snpcalls(void * data, bam1_t * b): skip = 0 # realign read - changes base qualities - if d.seq != NULL and is_cns and not is_nobaq: - bam_prob_realn(b, d.seq) + if d.seq != NULL and is_cns and not is_nobaq: + # flag: + # apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; + sam_prob_realn(b, d.seq, d.seq_len, 0) if d.seq != NULL and capQ_thres > 10: - q = bam_cap_mapQ(b, d.seq, capQ_thres) + q = sam_cap_mapq(b, d.seq, d.seq_len, capQ_thres) if q < 0: skip = 1 elif b.core.qual > q: @@ -2089,7 +2117,7 @@ cdef class IteratorColumn: Valid values are None, "all" (default), "nofilter" or "samtools". See AlignmentFile.pileup for description. - + fastafile A :class:`~pysam.FastaFile` object @@ -2271,7 +2299,7 @@ cdef class IteratorColumnRegion(IteratorColumn): if self.plp == NULL: raise StopIteration - + if self.truncate: if self.start > self.pos: continue if self.pos >= self.end: raise StopIteration @@ -2313,7 +2341,7 @@ cdef class IteratorColumnAllRefs(IteratorColumn): self.pos, self.n_plp, self.samfile) - + # otherwise, proceed to next reference or stop self.tid += 1 if self.tid < self.samfile.nreferences: @@ -2465,7 +2493,7 @@ cdef class IndexedReads: Raises ------ - + KeyError if the `query_name` is not in the index. diff --git a/pysam/libcbcf.pxd b/pysam/libcbcf.pxd index fc7f56c..1d4129b 100644 --- a/pysam/libcbcf.pxd +++ b/pysam/libcbcf.pxd @@ -38,45 +38,44 @@ from pysam.libchtslib cimport * cdef class VariantHeader(object): cdef bcf_hdr_t *ptr - cpdef VariantRecord new_record(self) cdef _subset_samples(self, include_samples) cdef class VariantHeaderRecord(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef bcf_hrec_t *ptr cdef class VariantHeaderRecords(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef class VariantHeaderContigs(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef class VariantHeaderSamples(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef class VariantContig(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef int id cdef class VariantMetadata(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef int type cdef int id cdef class VariantHeaderMetadata(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef int32_t type cdef class VariantRecord(object): - cdef VariantHeader header + cdef readonly VariantHeader header cdef bcf1_t *ptr @@ -107,7 +106,7 @@ cdef class BaseIndex(object): cdef class BCFIndex(BaseIndex): - cdef VariantHeader header + cdef readonly VariantHeader header cdef hts_idx_t *ptr @@ -139,6 +138,4 @@ cdef class VariantFile(HTSFile): cdef readonly bint is_reading # true if file has begun reading records cdef readonly bint header_written # true if header has already been written - cpdef VariantRecord new_record(self) - cpdef int write(self, VariantRecord record) except -1 diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index 8f40451..9413e70 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -101,9 +101,6 @@ from cpython.version cimport PY_MAJOR_VERSION from pysam.libchtslib cimport HTSFile, hisremote -from warnings import warn - - __all__ = ['VariantFile', 'VariantHeader', 'VariantHeaderRecord', @@ -129,6 +126,13 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_ from pysam.libcutils cimport encode_filename, from_string_and_size +######################################################################## +######################################################################## +## Sentinel object +######################################################################## + +cdef object _nothing = object() + ######################################################################## ######################################################################## ## VCF/BCF string intern system @@ -154,6 +158,55 @@ cdef inline bcf_str_cache_get_charptr(const char* s): return val +######################################################################## +######################################################################## +## Genotype math +######################################################################## + +cdef int comb(int n, int k) except -1: + """Return binomial coeffient: n choose k + + >>> comb(5, 1) + 5 + >>> comb(5, 2) + 10 + >>> comb(2, 2) + 1 + >>> comb(100, 2) + 4950 + """ + if k > n: + return 0 + elif k == n: + return 1 + elif k > n // 2: + k = n - k + + cdef d, result + + d = result = n - k + 1 + for i in range(2, k + 1): + d += 1 + result *= d + result //= i + return result + + +cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1: + """Return the count of genotypes expected for the given ploidy and number of alleles. + + >>> bcf_geno_combinations(1, 2) + 2 + >>> bcf_geno_combinations(2, 2) + 3 + >>> bcf_geno_combinations(2, 3) + 6 + >>> bcf_geno_combinations(3, 2) + 4 + """ + return comb(alleles + ploidy - 1, ploidy) + + ######################################################################## ######################################################################## ## Low level type conversion helpers @@ -165,7 +218,32 @@ cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id): cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id): - return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0 + return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0 + + +cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1: + if sample < 0: + raise ValueError('genotype is only valid as a format field') + + cdef int32_t *gt_arr = NULL + cdef int ngt = 0 + ngt = bcf_get_genotypes(hdr, rec, >_arr, &ngt) + + if ngt <= 0 or not gt_arr: + return 0 + + assert ngt % rec.n_sample == 0 + cdef int max_ploidy = ngt // rec.n_sample + cdef int32_t *gt = gt_arr + sample * max_ploidy + cdef int ploidy = 0 + + while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end: + gt += 1 + ploidy += 1 + + free(gt_arr) + + return bcf_geno_combinations(ploidy, rec.n_allele) cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0): @@ -185,19 +263,25 @@ cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int sca cdef int32_t *data32 cdef float *dataf cdef int i + cdef bytes b if not data or n <= 0: return None if type == BCF_BT_CHAR: datac = data - while n and datac[n-1] == bcf_str_vector_end: - n -= 1 - value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None - # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. - value = tuple(v or None for v in value.split(',')) if value else () - # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. + if not n: + value = () + else: + # Check if at least one null terminator is present + if datac[n-1] == bcf_str_vector_end: + # If so, create a string up to the first null terminator + b = datac + else: + # Otherwise, copy the entire block + b = datac[:n] + value = tuple(v.decode('ascii') if v and v != bcf_str_missing else None for v in b.split(b',')) else: value = [] if type == BCF_BT_INT8: @@ -251,13 +335,13 @@ cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen): cdef float *dataf cdef ssize_t i, value_count = len(values) - assert(value_count <= n) + assert value_count <= n if bt_type == BCF_BT_CHAR: if not isinstance(values, (str, bytes)): - values = b','.join(force_bytes(v) if v is not None else b'' for v in values) + values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values) value_count = len(values) - assert(value_count <= n) + assert value_count <= n datac = data memcpy(datac, values, value_count) for i in range(value_count, n): @@ -392,7 +476,7 @@ cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values, raise TypeError('unsupported types') -cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar): +cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample): if record is None: raise ValueError('record must not be None') @@ -418,7 +502,7 @@ cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *cou elif length == BCF_VL_A: count[0] = r.n_allele - 1 elif length == BCF_VL_G: - count[0] = r.n_allele * (r.n_allele + 1) // 2 + count[0] = bcf_genotype_count(hdr, r, sample) elif length == BCF_VL_VAR: count[0] = -1 else: @@ -435,7 +519,7 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z): cdef ssize_t count cdef int scalar - bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar) + bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1) if z.len == 0: if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG: @@ -466,14 +550,15 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z): return value -cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type, +cdef object bcf_check_values(VariantRecord record, value, int sample, + int hl_type, int ht_type, int id, int bt_type, ssize_t bt_len, ssize_t *value_count, int *scalar, int *realloc): if record is None: raise ValueError('record must not be None') - bcf_get_value_count(record, hl_type, id, value_count, scalar) + bcf_get_value_count(record, hl_type, id, value_count, scalar, sample) # Validate values now that we know the type and size values = (value,) if not isinstance(value, (list, tuple)) else value @@ -485,11 +570,12 @@ cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_ty # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1) value_count[0] = -1 - if value_count[0] != -1 and value_count[0] != len(values): + cdef int given = len(values) + if value_count[0] != -1 and value_count[0] != given: if scalar[0]: - raise TypeError('value expected to be scalar'.format(value_count[0])) + raise TypeError('value expected to be scalar, given len={}'.format(value_count[0], given)) else: - raise TypeError('values expected to be {:d}-tuple'.format(value_count[0])) + raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given)) if ht_type == BCF_HT_REAL: for v in values: @@ -572,33 +658,29 @@ cdef bcf_info_set_value(VariantRecord record, key, value): cdef bcf_hdr_t *hdr = record.header.ptr cdef bcf1_t *r = record.ptr - cdef vdict_t *d - cdef khiter_t k cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0 cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size if bcf_unpack(r, BCF_UN_INFO) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) if info: info_id = info.key else: - d = hdr.dict[BCF_DT_ID] - k = kh_get_vdict(d, bkey) + info_id = bcf_header_get_info_id(hdr, bkey) - if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: - raise KeyError('unknown INFO') - - info_id = kh_val_vdict(d, k).id + if info_id < 0: + raise KeyError('unknown INFO: {}'.format(key)) if not check_header_id(hdr, BCF_HL_INFO, info_id): raise ValueError('Invalid header') info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) - values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id, + values = bcf_check_values(record, value, -1, + BCF_HL_INFO, info_type, info_id, info.type if info else -1, info.len if info else -1, &value_count, &scalar, &realloc) @@ -611,13 +693,16 @@ cdef bcf_info_set_value(VariantRecord record, key, value): vlen = value_count < 0 value_count = len(values) + # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS # If we can, write updated values to existing allocated storage - if info and not realloc: + if 0 and info and not realloc: r.d.shared_dirty |= BCF1_DIRTY_INF if value_count == 0: info.len = 0 - # FIXME: Check if need to free vptr if info.len > 0? + if not info.vptr: + info.vptr = &info.v1.i + elif value_count == 1: # FIXME: Check if need to free vptr if info.len > 0? if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32: @@ -626,9 +711,13 @@ cdef bcf_info_set_value(VariantRecord record, key, value): bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen) else: raise TypeError('unsupported info type code') + info.len = 1 + if not info.vptr: + info.vptr = &info.v1.i else: bcf_object_to_array(values, info.vptr, info.type, info.len, vlen) + return alloc_len = max(1, value_count) @@ -665,13 +754,13 @@ cdef bcf_info_del_value(VariantRecord record, key): if bcf_unpack(r, BCF_UN_INFO) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) if not info: raise KeyError(key) - bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar) + bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1) if value_count <= 0: null_value = () @@ -695,16 +784,16 @@ cdef bcf_format_get_value(VariantRecordSample sample, key): if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) if not fmt or not fmt.p: - raise KeyError('invalid FORMAT') + raise KeyError('invalid FORMAT: {}'.format(key)) if is_gt_fmt(hdr, fmt.id): return bcf_format_get_allele_indices(sample) - bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar) + bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index) if fmt.p and fmt.n and fmt.size: return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar) @@ -720,6 +809,10 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): if sample is None: raise ValueError('sample must not be None') + if key == 'phased': + sample.phased = bool(value) + return + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef int fmt_id @@ -731,7 +824,7 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) if fmt: @@ -741,7 +834,7 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): k = kh_get_vdict(d, bkey) if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF: - raise KeyError('unknown format') + raise KeyError('unknown format: {}'.format(key)) fmt_id = kh_val_vdict(d, k).id @@ -758,7 +851,8 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT. fmt_type = BCF_HT_INT - values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id, + values = bcf_check_values(sample.record, value, sample.index, + BCF_HL_FMT, fmt_type, fmt_id, fmt.type if fmt else -1, fmt.n if fmt else -1, &value_count, &scalar, &realloc) @@ -776,7 +870,7 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): if fmt and fmt.n > alloc_len: alloc_len = fmt.n - n = bcf_hdr_nsamples(hdr) + n = r.n_sample new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen) cdef char *valp = new_values @@ -816,13 +910,13 @@ cdef bcf_format_del_value(VariantRecordSample sample, key): if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) if not fmt or not fmt.p: raise KeyError(key) - bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar) + bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index) if value_count <= 0: null_value = () @@ -840,7 +934,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample): cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) + cdef int32_t n = r.n_sample if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') @@ -900,7 +994,7 @@ cdef bcf_format_get_alleles(VariantRecordSample sample): cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr - cdef int32_t nsamples = bcf_hdr_nsamples(hdr) + cdef int32_t nsamples = r.n_sample if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') @@ -951,7 +1045,7 @@ cdef bint bcf_sample_get_phased(VariantRecordSample sample): cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) + cdef int32_t n = r.n_sample if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') @@ -1014,7 +1108,7 @@ cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased): cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) + cdef int32_t n = r.n_sample if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') @@ -1061,6 +1155,29 @@ cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased): data32[i] = (data32[i] & 0xFFFFFFFE) | phased +cdef inline bcf_sync_end(VariantRecord record): + cdef bcf_hdr_t *hdr = record.header.ptr + cdef bcf_info_t *info + cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END') + cdef int ref_len = len(record.ref) + + # Delete INFO/END if no alleles are present or if rlen is equal to len(ref) + if not record.ptr.n_allele or record.ptr.rlen == ref_len: + # If INFO/END is not defined in the header, it doesn't exist in the record + if end_id >= 0: + info = bcf_get_info(hdr, record.ptr, b'END') + if info and info.vptr: + if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0: + raise ValueError('Unable to delete END') + else: + # Create END header, if not present + if end_id < 0: + record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval') + + # Update to reflect stop position + bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen) + + ######################################################################## ######################################################################## ## Variant Header objects @@ -1205,6 +1322,28 @@ cdef class VariantHeaderRecord(object): """D.values() -> list of D's values""" return list(self.itervalues()) + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. + """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + # Mappings are not hashable by default, but subclasses can change this __hash__ = None @@ -1235,9 +1374,8 @@ cdef class VariantHeaderRecord(object): cdef bcf_hrec_t *r = self.ptr if not r: return - assert(r.key) + assert r.key cdef char *key = r.key if r.type == BCF_HL_GEN else r.value - print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key)) bcf_hdr_remove(hdr, r.type, key) self.ptr = NULL @@ -1358,8 +1496,8 @@ cdef class VariantMetadata(object): def remove_header(self): cdef bcf_hdr_t *hdr = self.header.ptr - cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key - bcf_hdr_remove(hdr, self.type, bkey) + cdef const char *key = hdr.id[BCF_DT_ID][self.id].key + bcf_hdr_remove(hdr, self.type, key) cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id): @@ -1437,11 +1575,11 @@ cdef class VariantHeaderMetadata(object): cdef bcf_hdr_t *hdr = self.header.ptr cdef vdict_t *d = hdr.dict[BCF_DT_ID] - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef khiter_t k = kh_get_vdict(d, bkey) if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: - raise KeyError('invalid key') + raise KeyError('invalid key: {}'.format(key)) return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id) @@ -1449,11 +1587,11 @@ cdef class VariantHeaderMetadata(object): cdef bcf_hdr_t *hdr = self.header.ptr cdef vdict_t *d = hdr.dict[BCF_DT_ID] - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef khiter_t k = kh_get_vdict(d, bkey) if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: - raise KeyError('invalid key') + raise KeyError('invalid key: {}'.format(key)) bcf_hdr_remove(hdr, self.type, bkey) #bcf_hdr_sync(hdr) @@ -1555,7 +1693,7 @@ cdef class VariantContig(object): return length if length else None @property - def header(self): + def header_record(self): """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object""" cdef bcf_hdr_t *hdr = self.header.ptr cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0] @@ -1563,8 +1701,8 @@ cdef class VariantContig(object): def remove_header(self): cdef bcf_hdr_t *hdr = self.header.ptr - cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key - bcf_hdr_remove(hdr, BCF_HL_CTG, bkey) + cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key + bcf_hdr_remove(hdr, BCF_HL_CTG, key) cdef VariantContig makeVariantContig(VariantHeader header, int id): @@ -1607,11 +1745,11 @@ cdef class VariantHeaderContigs(object): return makeVariantContig(self.header, index) cdef vdict_t *d = hdr.dict[BCF_DT_CTG] - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef khiter_t k = kh_get_vdict(d, bkey) if k == kh_end(d): - raise KeyError('invalid contig') + raise KeyError('invalid contig: {}'.format(key)) cdef int id = kh_val_vdict(d, k).id @@ -1620,7 +1758,7 @@ cdef class VariantHeaderContigs(object): def remove_header(self, key): cdef bcf_hdr_t *hdr = self.header.ptr cdef int index - cdef const char *bkey + cdef const char *ckey cdef vdict_t *d cdef khiter_t k @@ -1628,15 +1766,15 @@ cdef class VariantHeaderContigs(object): index = key if index < 0 or index >= hdr.n[BCF_DT_CTG]: raise IndexError('invalid contig index') - bkey = hdr.id[BCF_DT_CTG][self.id].key + ckey = hdr.id[BCF_DT_CTG][self.id].key else: d = hdr.dict[BCF_DT_CTG] key = force_bytes(key) if kh_get_vdict(d, key) == kh_end(d): - raise KeyError('invalid contig') - bkey = key + raise KeyError('invalid contig: {}'.format(key)) + ckey = key - bcf_hdr_remove(hdr, BCF_HL_CTG, bkey) + bcf_hdr_remove(hdr, BCF_HL_CTG, ckey) def clear_header(self): cdef bcf_hdr_t *hdr = self.header.ptr @@ -1704,7 +1842,8 @@ cdef class VariantHeaderContigs(object): if id in self: raise ValueError('Header already exists for contig {}'.format(id)) - items = [('ID', id)] + kwargs.items() + items = [('ID', id)] + items += kwargs.items() self.header.add_meta('contig', items=items) @@ -1749,7 +1888,7 @@ cdef class VariantHeaderSamples(object): def __contains__(self, key): cdef bcf_hdr_t *hdr = self.header.ptr cdef vdict_t *d = hdr.dict[BCF_DT_SAMPLE] - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef khiter_t k = kh_get_vdict(d, bkey) return k != kh_end(d) @@ -1796,7 +1935,6 @@ cdef class VariantHeader(object): self.ptr = NULL def __bool__(self): - # self.ptr == NULL should be impossible return self.ptr != NULL def copy(self): @@ -1886,11 +2024,50 @@ cdef class VariantHeader(object): finally: free(hstr) - cpdef VariantRecord new_record(self): - """Create a new empty VariantRecord""" - r = makeVariantRecord(self, bcf_init()) - r.ptr.n_sample = bcf_hdr_nsamples(self.ptr) - return r + def new_record(self, contig=None, start=0, stop=0, alleles=None, + id=None, qual=None, filter=None, info=None, samples=None, + **kwargs): + """Create a new empty VariantRecord. + + Arguments are currently experimental. Use with caution and expect + changes in upcoming releases. + + """ + rec = makeVariantRecord(self, bcf_init()) + rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr) + + if contig is not None: + rec.contig = contig + if alleles is not None: + rec.alleles = alleles + + rec.start = start + rec.stop = stop + rec.id = id + rec.qual = qual + + if filter is not None: + if isinstance(filter, (list, tuple, VariantRecordFilter)): + for f in filter: + rec.filter.add(f) + else: + rec.filter.add(filter) + + if info: + rec.info.update(info) + + if kwargs: + if 'GT' in kwargs: + rec.samples[0]['GT'] = kwargs.pop('GT') + rec.samples[0].update(kwargs) + + if samples: + for i, sample in enumerate(samples): + if 'GT' in sample: + rec.samples[i]['GT'] = sample.pop('GT') + rec.samples[i].update(sample) + + return rec def add_record(self, VariantHeaderRecord record): """Add an existing :class:`VariantHeaderRecord` to this header""" @@ -1963,6 +2140,23 @@ cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): return header +cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2: + cdef vdict_t *d + cdef khiter_t k + cdef int info_id + + if isinstance(key, str): + key = force_bytes(key) + + d = hdr.dict[BCF_DT_ID] + k = kh_get_vdict(d, key) + + if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: + return -1 + + return kh_val_vdict(d, k).id + + ######################################################################## ######################################################################## ## Variant Record objects @@ -2001,7 +2195,7 @@ cdef class VariantRecordFilter(object): id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): - raise KeyError('Invalid filter') + raise KeyError('Invalid filter: {}'.format(key)) return makeVariantMetadata(self.record.header, BCF_HL_FLT, id) @@ -2014,11 +2208,11 @@ cdef class VariantRecordFilter(object): if key == '.': key = 'PASS' - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) if not check_header_id(hdr, BCF_HL_FLT, id): - raise KeyError('Invalid filter') + raise KeyError('Invalid filter: {}'.format(key)) bcf_add_filter(hdr, r, id) @@ -2043,7 +2237,7 @@ cdef class VariantRecordFilter(object): id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): - raise KeyError('Invalid filter') + raise KeyError('Invalid filter: {}'.format(key)) bcf_remove_filter(hdr, r, id, 0) @@ -2071,7 +2265,7 @@ cdef class VariantRecordFilter(object): def __contains__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) return bcf_has_filter(hdr, r, bkey) == 1 def iterkeys(self): @@ -2100,6 +2294,20 @@ cdef class VariantRecordFilter(object): """D.values() -> list of D's values""" return list(self.itervalues()) + def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.record.ptr + cdef bcf1_t *o = other.record.ptr + + cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other)) + + if op == 3: + cmp = not cmp + + return cmp + # Mappings are not hashable by default, but subclasses can change this __hash__ = None @@ -2146,11 +2354,11 @@ cdef class VariantRecordFormat(object): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) if not fmt or not fmt.p: - raise KeyError('unknown format') + raise KeyError('unknown format: {}'.format(key)) return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id) @@ -2158,11 +2366,11 @@ cdef class VariantRecordFormat(object): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) if not fmt or not fmt.p: - raise KeyError('unknown format') + raise KeyError('unknown format: {}'.format(key)) if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0: raise ValueError('Unable to delete FORMAT') @@ -2204,7 +2412,7 @@ cdef class VariantRecordFormat(object): def __contains__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) return fmt != NULL and fmt.p != NULL @@ -2259,38 +2467,65 @@ cdef class VariantRecordInfo(object): raise TypeError('this class cannot be instantiated from Python') def __len__(self): - return self.record.ptr.n_info + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i, count = 0 + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: + count += 1 + + return count def __bool__(self): - return self.record.ptr.n_info != 0 + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: + return True + + return False def __getitem__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - cdef vdict_t *d - cdef khiter_t k - cdef info_id if bcf_unpack(r, BCF_UN_INFO) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) - cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + cdef bytes bkey = force_bytes(key) - if not info: - d = hdr.dict[BCF_DT_ID] - k = kh_get_vdict(d, bkey) + if strcmp(bkey, b'END') == 0: + raise KeyError('END is a reserved attribute; access is via record.stop') - if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: - raise KeyError('Unknown INFO field: {}'.format(key)) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) - info_id = kh_val_vdict(d, k).id - else: - info_id = info.key + # Cannot stop here if info == NULL, since flags must return False + cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key + + if info_id < 0: + raise KeyError('Unknown INFO field: {}'.format(key)) if not check_header_id(hdr, BCF_HL_INFO, info_id): raise ValueError('Invalid header') + # Handle type=Flag values if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: return info != NULL and info.vptr != NULL @@ -2300,18 +2535,42 @@ cdef class VariantRecordInfo(object): return bcf_info_get_value(self.record, info) def __setitem__(self, key, value): + cdef bytes bkey = force_bytes(key) + + if strcmp(bkey, b'END') == 0: + raise KeyError('END is a reserved attribute; access is via record.stop') + + if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + bcf_info_set_value(self.record, key, value) def __delitem__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr + cdef bytes bkey = force_bytes(key) + if strcmp(bkey, b'END') == 0: + raise KeyError('END is a reserved attribute; access is via record.stop') + if bcf_unpack(r, BCF_UN_INFO) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + # Cannot stop here if info == NULL, since flags must return False + cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key + + if info_id < 0: + raise KeyError('Unknown INFO field: {}'.format(key)) + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + + # Handle flags + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr): + return + if not info or not info.vptr: raise KeyError('Unknown INFO field: {}'.format(key)) @@ -2333,6 +2592,8 @@ cdef class VariantRecordInfo(object): info = &r.d.info[i] if info and info.vptr: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if strcmp(key, b'END') == 0: + continue if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0: raise ValueError('Unable to delete INFO') @@ -2340,20 +2601,49 @@ cdef class VariantRecordInfo(object): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr cdef bcf_info_t *info + cdef const char *key cdef int i + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + for i in range(r.n_info): info = &r.d.info[i] if info and info.vptr: - yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)) + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if strcmp(key, b'END') != 0: + yield bcf_str_cache_get_charptr(key) def get(self, key, default=None): """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" - try: - return self[key] - except KeyError: + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + + if strcmp(bkey, b'END') == 0: return default + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + # Cannot stop here if info == NULL, since flags must return False + cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + + # Handle flags + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: + return info != NULL and info.vptr != NULL + + if not info or not info.vptr: + return default + + return bcf_info_get_value(self.record, info) + def __contains__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr @@ -2361,10 +2651,14 @@ cdef class VariantRecordInfo(object): if bcf_unpack(r, BCF_UN_INFO) < 0: raise ValueError('Error unpacking VariantRecord') - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) + + if strcmp(bkey, b'END') == 0: + return False + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) - return info != NULL + return info != NULL and info.vptr != NULL def iterkeys(self): """D.iterkeys() -> an iterator over the keys of D""" @@ -2372,28 +2666,40 @@ cdef class VariantRecordInfo(object): def itervalues(self): """D.itervalues() -> an iterator over the values of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr cdef bcf_info_t *info + cdef const char *key cdef int i + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + for i in range(r.n_info): info = &r.d.info[i] if info and info.vptr: - yield bcf_info_get_value(self.record, info) + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if strcmp(key, b'END') != 0: + yield bcf_info_get_value(self.record, info) def iteritems(self): """D.iteritems() -> an iterator over the (key, value) items of D""" cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr cdef bcf_info_t *info + cdef const char *key cdef int i + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + for i in range(r.n_info): info = &r.d.info[i] if info and info.vptr: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) - value = bcf_info_get_value(self.record, info) - yield bcf_str_cache_get_charptr(key), value + if strcmp(key, b'END') != 0: + value = bcf_info_get_value(self.record, info) + yield bcf_str_cache_get_charptr(key), value def keys(self): """D.keys() -> list of D's keys""" @@ -2407,11 +2713,75 @@ cdef class VariantRecordInfo(object): """D.values() -> list of D's values""" return list(self.itervalues()) + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. + """ + for k, v in items.items(): + if k != 'END': + self[k] = v + + if kwargs: + kwargs.pop('END', None) + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + # Cannot stop here if info == NULL, since flags must return False + cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key + + if info_id < 0: + if default is _nothing: + raise KeyError('Unknown INFO field: {}'.format(key)) + return default + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + + # Handle flags + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr): + return + + if not info or not info.vptr: + if default is _nothing: + raise KeyError('Unknown INFO field: {}'.format(key)) + return default + + value = bcf_info_get_value(self.record, info) + + if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0: + raise ValueError('Unable to delete INFO') + + return value + + def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.record.ptr + cdef bcf1_t *o = other.record.ptr + + # Cannot use n_info as shortcut logic, since null values may remain + cdef bint cmp = dict(self) == dict(other) + + if op == 3: + cmp = not cmp + + return cmp + # Mappings are not hashable by default, but subclasses can change this __hash__ = None - #TODO: implement __richcmp__ - cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record): if not record: @@ -2429,15 +2799,15 @@ cdef class VariantRecordSamples(object): raise TypeError('this class cannot be instantiated from Python') def __len__(self): - return bcf_hdr_nsamples(self.record.header.ptr) + return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr) def __bool__(self): - return bcf_hdr_nsamples(self.record.header.ptr) != 0 + return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0 def __getitem__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - cdef int n = bcf_hdr_nsamples(hdr) + cdef int n = self.record.ptr.n_sample cdef int sample_index cdef vdict_t *d cdef khiter_t k @@ -2448,7 +2818,7 @@ cdef class VariantRecordSamples(object): bkey = force_bytes(key) sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) if sample_index < 0: - raise KeyError('invalid sample name') + raise KeyError('invalid sample name: {}'.format(key)) if sample_index < 0 or sample_index >= n: raise IndexError('invalid sample index') @@ -2458,7 +2828,7 @@ cdef class VariantRecordSamples(object): def __iter__(self): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - cdef int32_t i, n = bcf_hdr_nsamples(hdr) + cdef int32_t i, n = self.record.ptr.n_sample for i in range(n): yield charptr_to_str(hdr.samples[i]) @@ -2473,7 +2843,7 @@ cdef class VariantRecordSamples(object): def __contains__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - cdef int n = bcf_hdr_nsamples(hdr) + cdef int n = self.record.ptr.n_sample cdef int sample_index cdef vdict_t *d cdef khiter_t k @@ -2484,7 +2854,7 @@ cdef class VariantRecordSamples(object): bkey = force_bytes(key) sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) if sample_index < 0: - raise KeyError('invalid sample name') + raise KeyError('invalid sample name: {}'.format(key)) return 0 <= sample_index < n @@ -2496,7 +2866,7 @@ cdef class VariantRecordSamples(object): """D.itervalues() -> an iterator over the values of D""" cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - cdef int32_t i, n = bcf_hdr_nsamples(hdr) + cdef int32_t i, n = self.record.ptr.n_sample for i in range(n): yield makeVariantRecordSample(self.record, i) @@ -2505,7 +2875,7 @@ cdef class VariantRecordSamples(object): """D.iteritems() -> an iterator over the (key, value) items of D""" cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - cdef int32_t i, n = bcf_hdr_nsamples(hdr) + cdef int32_t i, n = self.record.ptr.n_sample for i in range(n): yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i)) @@ -2522,11 +2892,45 @@ cdef class VariantRecordSamples(object): """D.values() -> list of D's values""" return list(self.itervalues()) + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. + """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + + def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.record.ptr + cdef bcf1_t *o = other.record.ptr + + cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values()) + + if op == 3: + cmp = not cmp + + return cmp + # Mappings are not hashable by default, but subclasses can change this __hash__ = None - #TODO: implement __richcmp__ - cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record): if not record: @@ -2566,6 +2970,7 @@ cdef class VariantRecord(object): raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr))) bcf_translate(dst_hdr, src_hdr, self.ptr) + self.header = dst_header @property def rid(self): @@ -2627,6 +3032,7 @@ cdef class VariantRecord(object): if p < 1: raise ValueError('Position must be positive') self.ptr.pos = p - 1 + bcf_sync_end(self) @property def start(self): @@ -2639,6 +3045,7 @@ cdef class VariantRecord(object): if s < 0: raise ValueError('Start coordinate must be non-negative') self.ptr.pos = s + bcf_sync_end(self) @property def stop(self): @@ -2648,25 +3055,21 @@ cdef class VariantRecord(object): @stop.setter def stop(self, value): cdef int s = value - if s < self.ptr.pos: - raise ValueError('Stop coordinate must be greater than or equal to start') + if s < 0: + raise ValueError('Stop coordinate must be non-negative') self.ptr.rlen = s - self.ptr.pos - if self.ptr.rlen != len(self.ref) or 'END' in self.info: - self.info['END'] = s + bcf_sync_end(self) @property def rlen(self): - """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)""" + """record length on chrom/contig (aka rec.stop - rec.start)""" return self.ptr.rlen @rlen.setter def rlen(self, value): cdef int r = value - if r < 0: - raise ValueError('Reference length must be non-negative') self.ptr.rlen = r - if r != len(self.ref) or 'END' in self.info: - self.info['END'] = self.ptr.pos + r + bcf_sync_end(self) @property def qual(self): @@ -2732,6 +3135,8 @@ cdef class VariantRecord(object): else: alleles = [value] self.alleles = alleles + self.ptr.rlen = len(value) + bcf_sync_end(self) @property def alleles(self): @@ -2749,17 +3154,28 @@ cdef class VariantRecord(object): return res @alleles.setter - def alleles(self, value): + def alleles(self, values): cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: raise ValueError('Error unpacking VariantRecord') - value = [force_bytes(v) for v in value] - if b'' in value: + + values = [force_bytes(v) for v in values] + + if len(values) < 2: + raise ValueError('must set at least 2 alleles') + + if b'' in values: raise ValueError('cannot set null allele') - value = b','.join(value) + + value = b','.join(values) + if bcf_update_alleles_str(self.header.ptr, r, value) < 0: raise ValueError('Error updating alleles') + self.ptr.rlen = len(values[0]) + bcf_sync_end(self) + @property def alts(self): """tuple of alt alleles""" @@ -2815,6 +3231,32 @@ cdef class VariantRecord(object): raise ValueError('Error unpacking VariantRecord') return makeVariantRecordSamples(self) + def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.ptr + cdef bcf1_t *o = other.ptr + + cdef bint cmp = self is other or ( + s.pos == o.pos + and s.rlen == o.rlen + and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual)) + or s.qual == o.qual) + and s.n_sample == o.n_sample + and s.n_allele == o.n_allele + and self.contig == other.contig + and self.alleles == other.alleles + and self.id == other.id + and self.info == other.info + and self.filter == other.filter + and self.samples == other.samples) + + if op == 3: + cmp = not cmp + + return cmp + def __str__(self): cdef kstring_t line cdef char c @@ -2896,7 +3338,7 @@ cdef class VariantRecordSample(object): """sample name""" cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) + cdef int32_t n = r.n_sample if self.index < 0 or self.index >= n: raise ValueError('invalid sample index') @@ -3006,7 +3448,7 @@ cdef class VariantRecordSample(object): def __contains__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr - bkey = force_bytes(key) + cdef bytes bkey = force_bytes(key) cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) return fmt != NULL and fmt.p != NULL @@ -3036,11 +3478,42 @@ cdef class VariantRecordSample(object): """D.values() -> list of D's values""" return list(self.itervalues()) + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. + """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + + def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bint cmp = dict(self) == dict(other) + + if op == 3: + cmp = not cmp + + return cmp + # Mappings are not hashable by default, but subclasses can change this __hash__ = None - #TODO: implement __richcmp__ - cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index): if not record or sample_index < 0: @@ -3120,6 +3593,28 @@ cdef class BaseIndex(object): """D.values() -> list of D's values""" return list(self.itervalues()) + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. + """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + # Mappings are not hashable by default, but subclasses can change this __hash__ = None @@ -3253,7 +3748,7 @@ cdef class BCFIterator(BaseIterator): try: rid = index.refmap[contig] except KeyError: - raise ValueError('Unknown contig specified') + raise ValueError('Unknown contig specified: {}'.format(contig)) if start is None: start = 0 @@ -3409,7 +3904,7 @@ cdef class TabixIterator(BaseIterator): cdef class VariantFile(HTSFile): """*(filename, mode=None, index_filename=None, header=None, drop_samples=False, - duplicate_filehandle=True)* + duplicate_filehandle=True, ignore_truncation=False)* A :term:`VCF`/:term:`BCF` formatted file. The file is automatically opened. @@ -3451,7 +3946,7 @@ cdef class VariantFile(HTSFile): drop_samples: bool Ignore sample information when reading. - duplicate_filehandle: bool + duplicate_filehandle: bool By default, file handles passed either directly or through File-like objects will be duplicated before passing them to htslib. The duplication prevents issues where the same stream @@ -3459,6 +3954,11 @@ cdef class VariantFile(HTSFile): high-level python object. Set to False to turn off duplication. + ignore_truncation: bool + Issue a warning, instead of raising an error if the current file + appears to be truncated due to a missing EOF marker. Only applies + to bgzipped formats. (Default=False) + """ def __cinit__(self, *args, **kwargs): self.htsfile = NULL @@ -3478,19 +3978,39 @@ cdef class VariantFile(HTSFile): self.open(*args, **kwargs) + def __dealloc__(self): + if not self.htsfile or not self.header: + return + + # Write header if no records were written + if self.htsfile.is_write and not self.header_written: + with nogil: + bcf_hdr_write(self.htsfile, self.header.ptr) + + cdef int ret = hts_close(self.htsfile) + self.htsfile = NULL + self.header = self.index = None + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise OSError(errno, force_str(strerror(errno))) + def close(self): """closes the :class:`pysam.VariantFile`.""" - cdef int ret = 0 - self.header = self.index = None - if self.htsfile: - # Write header if no records were written - if self.htsfile.is_write and not self.header_written: - self.header_written = True - with nogil: - bcf_hdr_write(self.htsfile, self.header.ptr) + if not self.htsfile: + return - ret = hts_close(self.htsfile) - self.htsfile = NULL + # Write header if no records were written + if self.htsfile.is_write and not self.header_written: + with nogil: + bcf_hdr_write(self.htsfile, self.header.ptr) + + cdef int ret = hts_close(self.htsfile) + self.htsfile = NULL + self.header = self.index = None if ret < 0: global errno @@ -3525,7 +4045,7 @@ cdef class VariantFile(HTSFile): if ret == -1: raise StopIteration elif ret == -2: - raise IOError('truncated file') + raise OSError('truncated file') else: raise ValueError('Variant read failed') @@ -3572,7 +4092,8 @@ cdef class VariantFile(HTSFile): index_filename=None, VariantHeader header=None, drop_samples=False, - duplicate_filehandle=True): + duplicate_filehandle=True, + ignore_truncation=False): """open a vcf/bcf file. If open is called on an existing VariantFile, the current file will be @@ -3656,7 +4177,6 @@ cdef class VariantFile(HTSFile): elif mode.startswith(b'r'): # open file for reading - if not self._exists(): raise IOError('file `{}` not found'.format(filename)) @@ -3668,10 +4188,7 @@ cdef class VariantFile(HTSFile): if self.htsfile.format.format not in (bcf, vcf): raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode)) - if self.htsfile.format.compression == bgzf: - bgzfp = hts_get_bgzfp(self.htsfile) - if bgzfp and bgzf_check_EOF(bgzfp) == 0: - warn('[%s] Warning: no BGZF EOF marker; file may be truncated'.format(filename)) + self.check_truncation(ignore_truncation) with nogil: hdr = bcf_hdr_read(self.htsfile) @@ -3710,7 +4227,6 @@ cdef class VariantFile(HTSFile): """reset file position to beginning of file just after the header.""" return self.seek(self.start_offset) - def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False): """fetch records in a :term:`region` using 0-based indexing. The region is specified by :term:`contig`, *start* and *end*. @@ -3750,9 +4266,12 @@ cdef class VariantFile(HTSFile): self.is_reading = 1 return self.index.fetch(self, contig, start, stop, region, reopen) - cpdef VariantRecord new_record(self): - """Create a new empty VariantRecord""" - return self.header.new_record() + def new_record(self, *args, **kwargs): + """Create a new empty :class:`VariantRecord`. + + See :meth:`VariantHeader.new_record` + """ + return self.header.new_record(*args, **kwargs) cpdef int write(self, VariantRecord record) except -1: """ @@ -3782,6 +4301,9 @@ cdef class VariantFile(HTSFile): msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})' raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr))) + # Sync END annotation before writing + bcf_sync_end(record) + cdef int ret with nogil: diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd new file mode 100644 index 0000000..7c8e632 --- /dev/null +++ b/pysam/libcbcftools.pxd @@ -0,0 +1,3 @@ +cdef extern from "cbcftools_util.h": + + int bcftools_main(int argc, char *argv[]) diff --git a/pysam/libcbcftools.pyx b/pysam/libcbcftools.pyx new file mode 100644 index 0000000..8e90388 --- /dev/null +++ b/pysam/libcbcftools.pyx @@ -0,0 +1,2 @@ +def py_bcftools(): + pass diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx index 558ceff..f1d2fa9 100644 --- a/pysam/libcbgzf.pyx +++ b/pysam/libcbgzf.pyx @@ -14,9 +14,10 @@ from libc.stdlib cimport malloc, calloc, realloc, free from cpython.object cimport PyObject from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize -from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len -from pysam.libchtslib cimport * - +from pysam.libcutils cimport force_bytes, encode_filename +from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \ + bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \ + bgzf_tell, bgzf_getline, kstring_t, SEEK_SET, BGZF __all__ = ["BGZFile"] @@ -32,7 +33,7 @@ cdef class BGZFile(object): compressed file in text mode, use the gzip.open() function. """ cdef BGZF* bgzf - cdef bytes name, index + cdef readonly object name, index def __init__(self, filename, mode=None, index=None): """Constructor for the BGZFile class. @@ -47,10 +48,14 @@ cdef class BGZFile(object): raise ValueError("Invalid mode: {!r}".format(mode)) if not mode: mode = 'rb' - if mode and 'b' not in mode: + elif mode and 'b' not in mode: mode += 'b' - self.name = force_bytes(filename) - self.index = force_bytes(index) if index is not None else None + + mode = force_bytes(mode) + + self.name = encode_filename(filename) + self.index = encode_filename(index) if index is not None else None + self.bgzf = bgzf_open(self.name, mode) if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0: @@ -59,7 +64,7 @@ cdef class BGZFile(object): def __dealloc__(self): self.close() - def write(self,data): + def write(self, data): if not self.bgzf: raise ValueError("write() on closed BGZFile object") @@ -177,6 +182,15 @@ cdef class BGZFile(object): def seekable(self): return True + def tell(self): + if not self.bgzf: + raise ValueError("seek() on closed BGZFile object") + cdef int64_t off = bgzf_tell(self.bgzf) + if off < 0: + raise IOError('Error in tell on BGZFFile object') + + return off + def seek(self, offset, whence=io.SEEK_SET): if not self.bgzf: raise ValueError("seek() on closed BGZFile object") @@ -198,12 +212,27 @@ cdef class BGZFile(object): line.l = line.m = 0 line.s = NULL - if bgzf_getline(self.bgzf, '\n', &line) < 0: - raise IOError('Error reading line in BGZFFile object') - ret = charptr_to_str_w_len(line.s, line.l) + cdef int ret = bgzf_getline(self.bgzf, '\n', &line) + if ret == -1: + s = b'' + elif ret == -2: + if line.m: + free(line.s) + raise IOError('Error reading line in BGZFFile object') + else: + s = line.s[:line.l] if line.m: free(line.s) - return ret + return s + + def __iter__(self): + return self + + def __next__(self): + line = self.readline() + if not line: + raise StopIteration() + return line diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index 774152d..3af76f6 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -59,7 +59,7 @@ from cpython.version cimport PY_MAJOR_VERSION from pysam.libchtslib cimport \ faidx_nseq, fai_load, fai_destroy, fai_fetch, \ - faidx_seq_len, \ + faidx_seq_len, faidx_iseq, faidx_seq_len, \ faidx_fetch_seq, hisremote, \ bgzf_open, bgzf_close @@ -154,21 +154,17 @@ cdef class FastaFile: if self.fastafile == NULL: raise IOError("could not open file `%s`" % filename) - if self.is_remote: - filepath_index = os.path.basename( - re.sub("[^:]+:[/]*", "", filename)) + ".fai" - elif filepath_index is None: - filepath_index = filename + ".fai" - - if not os.path.exists(filepath_index): - raise ValueError("could not locate index file {}".format( - filepath_index)) - - with open(filepath_index) as inf: - data = [x.split("\t") for x in inf] - self._references = tuple(x[0] for x in data) - self._lengths = tuple(int(x[1]) for x in data) - self.reference2length = dict(zip(self._references, self._lengths)) + cdef int nreferences = faidx_nseq(self.fastafile) + cdef int x + cdef const char * s + self._references = [] + self._lengths = [] + for x from 0 <= x < nreferences: + s = faidx_iseq(self.fastafile, x) + ss = force_str(s) + self._references.append(ss) + self._lengths.append(faidx_seq_len(self.fastafile, s)) + self.reference2length = dict(zip(self._references, self._lengths)) def close(self): """close the file.""" @@ -447,6 +443,9 @@ cdef class FastxFile: ... print(entry.sequence) ... print(entry.comment) ... print(entry.quality) + >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout: + ... for entry in fin: + ... fout.write(str(entry)) """ def __cinit__(self, *args, **kwargs): diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 657a754..78a55f8 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -9,6 +9,12 @@ cdef extern from "Python.h": FILE* PyFile_AsFile(object) +# cython does not wrap stdarg +cdef extern from "stdarg.h": + ctypedef struct va_list: + pass + + cdef extern from "htslib/kstring.h" nogil: ctypedef struct kstring_t: size_t l, m @@ -54,7 +60,7 @@ cdef extern from "htslib/hfile.h" nogil: # @abstract Open the named file or URL as a stream # @return An hFILE pointer, or NULL (with errno set) if an error occurred. - hFILE *hopen(const char *filename, const char *mode) + hFILE *hopen(const char *filename, const char *mode, ...) # @abstract Associate a stream with an existing open file descriptor # @return An hFILE pointer, or NULL (with errno set) if an error occurred. @@ -97,6 +103,40 @@ cdef extern from "htslib/hfile.h" nogil: # @return The character read, or EOF on end-of-file or error int hgetc(hFILE *fp) + # Read from the stream until the delimiter, up to a maximum length + # @param buffer The buffer into which bytes will be written + # @param size The size of the buffer + # @param delim The delimiter (interpreted as an `unsigned char`) + # @param fp The file stream + # @return The number of bytes read, or negative on error. + # @since 1.4 + # + # Bytes will be read into the buffer up to and including a delimiter, until + # EOF is reached, or _size-1_ bytes have been written, whichever comes first. + # The string will then be terminated with a NUL byte (`\0`). + ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp) + + # Read a line from the stream, up to a maximum length + # @param buffer The buffer into which bytes will be written + # @param size The size of the buffer + # @param fp The file stream + # @return The number of bytes read, or negative on error. + # @since 1.4 + # + # Specialization of hgetdelim() for a `\n` delimiter. + ssize_t hgetln(char *buffer, size_t size, hFILE *fp) + + # Read a line from the stream, up to a maximum length + # @param buffer The buffer into which bytes will be written + # @param size The size of the buffer (must be > 1 to be useful) + # @param fp The file stream + # @return _buffer_ on success, or `NULL` if an error occurred. + # @since 1.4 + # + # This function can be used as a replacement for `fgets(3)`, or together with + # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_. + char *hgets(char *buffer, int size, hFILE *fp) + # @abstract Peek at characters to be read without removing them from buffers # @param fp The file stream # @param buffer The buffer to which the peeked bytes will be written @@ -623,7 +663,7 @@ cdef extern from "htslib/hts.h" nogil: # @return The index, or NULL if an error occurred. hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) - uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta) + uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) int hts_idx_get_stat(const hts_idx_t* idx, int tid, @@ -694,6 +734,79 @@ cdef extern from "htslib/hts.h" nogil: int hts_file_type(const char *fname) + # /*************************** + # * Revised MAQ error model * + # ***************************/ + + ctypedef struct errmod_t + + errmod_t *errmod_init(double depcorr) + void errmod_destroy(errmod_t *em) + + # /* + # n: number of bases + # m: maximum base + # bases[i]: qual:6, strand:1, base:4 + # q[i*m+j]: phred-scaled likelihood of (i,j) + # */ + int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic) + + # /***************************************** + # * q banded glocal alignment * + # *****************************************/ + + ctypedef struct probaln_par_t: + float d, e + int bw; + + int probaln_glocal(const uint8_t *ref, + int l_ref, + const uint8_t *query, + int l_query, const uint8_t *iqual, + const probaln_par_t *c, + int *state, uint8_t *q) + + # /********************** + # * MD5 implementation * + # **********************/ + + ctypedef struct hts_md5_context + + # /*! @abstract Intialises an MD5 context. + # * @discussion + # * The expected use is to allocate an hts_md5_context using + # * hts_md5_init(). This pointer is then passed into one or more calls + # * of hts_md5_update() to compute successive internal portions of the + # * MD5 sum, which can then be externalised as a full 16-byte MD5sum + # * calculation by calling hts_md5_final(). This can then be turned + # * into ASCII via hts_md5_hex(). + # * + # * To dealloate any resources created by hts_md5_init() call the + # * hts_md5_destroy() function. + # * + # * @return hts_md5_context pointer on success, NULL otherwise. + # */ + hts_md5_context *hts_md5_init() + + # /*! @abstract Updates the context with the MD5 of the data. */ + void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size) + + # /*! @abstract Computes the final 128-bit MD5 hash from the given context */ + void hts_md5_final(unsigned char *digest, hts_md5_context *ctx) + + # /*! @abstract Resets an md5_context to the initial state, as returned + # * by hts_md5_init(). + # */ + void hts_md5_reset(hts_md5_context *ctx) + + # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated + # * hex string. + # */ + void hts_md5_hex(char *hex, const unsigned char *digest) + + # /*! @abstract Deallocates any memory allocated by hts_md5_init. */ + void hts_md5_destroy(hts_md5_context *ctx) + inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) inline int hts_bin_bot(int bin, int n_lvls) @@ -803,7 +916,9 @@ cdef extern from "htslib/sam.h" nogil: uint8_t qual uint8_t l_qname uint16_t flag - uint16_t n_cigar + uint8_t unused1 + uint8_t l_extranul + uint32_t n_cigar int32_t l_qseq int32_t mtid int32_t mpos @@ -999,7 +1114,7 @@ cdef extern from "htslib/sam.h" nogil: #************************************* uint8_t *bam_aux_get(const bam1_t *b, const char *tag) - int32_t bam_aux2i(const uint8_t *s) + int64_t bam_aux2i(const uint8_t *s) double bam_aux2f(const uint8_t *s) char bam_aux2A(const uint8_t *s) char *bam_aux2Z(const uint8_t *s) @@ -1011,6 +1126,18 @@ cdef extern from "htslib/sam.h" nogil: #*** Pileup and Mpileup *** #************************** + # @abstract Generic pileup 'client data'. + # @discussion The pileup iterator allows setting a constructor and + # destructor function, which will be called every time a sequence is + # fetched and discarded. This permits caching of per-sequence data in + # a tidy manner during the pileup process. This union is the cached + # data to be manipulated by the "client" (the caller of pileup). + # + union bam_pileup_cd: + void *p + int64_t i + double f + # @abstract Structure for one alignment covering the pileup position. # @field b pointer to the alignment # @field qpos position of the read base at the pileup site, 0-based @@ -1041,6 +1168,7 @@ cdef extern from "htslib/sam.h" nogil: uint32_t is_tail uint32_t is_refskip uint32_t aux + bam_pileup_cd cd ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) ctypedef int (*bam_test_f)() @@ -1079,34 +1207,116 @@ cdef extern from "htslib/sam.h" nogil: # Added by AH # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" + # *********************************** + # * BAQ calculation and realignment * + # ***********************************/ + int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) + int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) + cdef extern from "htslib/faidx.h" nogil: ctypedef struct faidx_t: pass + # /// Build index for a FASTA or bgzip-compressed FASTA file. + # /** @param fn FASTA file name + # @param fnfai Name of .fai file to build. + # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). + # @return 0 on success; or -1 on failure + + # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. + # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI + # file will only be built if fn is bgzip-compressed. + # */ + int fai_build3(const char *fn, + const char *fnfai, + const char *fngzi) + + # /// Build index for a FASTA or bgzip-compressed FASTA file. + # /** @param fn FASTA file name + # @return 0 on success; or -1 on failure + # + # File "fn.fai" will be generated. This function is equivalent to + # fai_build3(fn, NULL, NULL); + # */ int fai_build(char *fn) + # /// Destroy a faidx_t struct void fai_destroy(faidx_t *fai) + # /// Load FASTA indexes. + # /** @param fn File name of the FASTA file (can be compressed with bgzip). + # @param fnfai File name of the FASTA index. + # @param fngzi File name of the bgzip index. + # @param flags Option flags to control index file caching and creation. + # @return Pointer to a faidx_t struct on success, NULL on failure. + + # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. + # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name. + # The bgzip index is only needed if fn is compressed. + + # If (flags & FAI_CREATE) is true, the index files will be built using + # fai_build3() if they are not already present. + # */ + faidx_t *fai_load3(const char *fn, + const char *fnfai, + const char *fngzi, + int flags) + + # /// Load index from "fn.fai". + # /** @param fn File name of the FASTA file + # @return Pointer to a faidx_t struct on success, NULL on failure. + # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE); + # */ faidx_t *fai_load(char *fn) + # /// Fetch the sequence in a region + # /** @param fai Pointer to the faidx_t struct + # @param reg Region in the format "chr2:20,000-30,000" + # @param len Length of the region; -2 if seq not present, -1 general error + # @return Pointer to the sequence; `NULL` on failure + # The returned sequence is allocated by `malloc()` family and should be destroyed + # by end users by calling `free()` on it. + # */ char *fai_fetch(faidx_t *fai, char *reg, int *len) - int faidx_nseq(faidx_t *fai) - - int faidx_has_seq(faidx_t *fai, const char *seq) - + # /// Fetch the sequence in a region + # /** @param fai Pointer to the faidx_t struct + # @param c_name Region name + # @param p_beg_i Beginning position number (zero-based) + # @param p_end_i End position number (zero-based) + # @param len Length of the region; -2 if c_name not present, -1 general error + # @return Pointer to the sequence; null on failure + # The returned sequence is allocated by `malloc()` family and should be destroyed + # by end users by calling `free()` on it. + # */ char *faidx_fetch_seq(faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) - int faidx_seq_len(faidx_t *fai, const char *seq) + # /// Query if sequence is present + # /** @param fai Pointer to the faidx_t struct + # @param seq Sequence name + # @return 1 if present or 0 if absent + # */ + int faidx_has_seq(faidx_t *fai, const char *seq) + + # /// Fetch the number of sequences + # /** @param fai Pointer to the faidx_t struct + # @return The number of sequences + # */ + int faidx_nseq(const faidx_t *fai) + # /// Return name of i-th sequence + const char *faidx_iseq(const faidx_t *fai, int i) + + # /// Return sequence length, -1 if not present + int faidx_seq_len(faidx_t *fai, const char *seq) # tabix support cdef extern from "htslib/tbx.h" nogil: @@ -1695,7 +1905,7 @@ cdef extern from "htslib/vcf.h" nogil: int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst) int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst) int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst) - int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst) + int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst) int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst) int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) @@ -1901,6 +2111,455 @@ cdef extern from "htslib/vcfutils.h" nogil: uint32_t bcf_ij2G(uint32_t i, uint32_t j) +cdef extern from "htslib/cram.h" nogil: + + enum cram_block_method: + ERROR + RAW + GZIP + BZIP2 + LZMA + RANS + RANS0 + RANS1 + GZIP_RLE + + enum cram_content_type: + CT_ERROR + FILE_HEADER + COMPRESSION_HEADER + MAPPED_SLICE + UNMAPPED_SLICE + EXTERNAL + CORE + + # Opaque data types, see cram_structs for the fully fledged versions. + ctypedef struct SAM_hdr + ctypedef struct cram_file_def + ctypedef struct cram_fd + ctypedef struct cram_container + ctypedef struct cram_block + ctypedef struct cram_slice + ctypedef struct cram_metrics + ctypedef struct cram_block_slice_hdr + ctypedef struct cram_block_compression_hdr + ctypedef struct refs_t + + # Accessor functions + + # + #----------------------------------------------------------------------------- + # cram_fd + # + SAM_hdr *cram_fd_get_header(cram_fd *fd) + void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) + + int cram_fd_get_version(cram_fd *fd) + void cram_fd_set_version(cram_fd *fd, int vers) + + int cram_major_vers(cram_fd *fd) + int cram_minor_vers(cram_fd *fd) + + hFILE *cram_fd_get_fp(cram_fd *fd) + void cram_fd_set_fp(cram_fd *fd, hFILE *fp) + + # + #----------------------------------------------------------------------------- + # cram_container + # + int32_t cram_container_get_length(cram_container *c) + void cram_container_set_length(cram_container *c, int32_t length) + int32_t cram_container_get_num_blocks(cram_container *c) + void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) + int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) + void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, + int32_t *landmarks) + + # Returns true if the container is empty (EOF marker) */ + int cram_container_is_empty(cram_fd *fd) + + + # + #----------------------------------------------------------------------------- + # cram_block + # + int32_t cram_block_get_content_id(cram_block *b) + int32_t cram_block_get_comp_size(cram_block *b) + int32_t cram_block_get_uncomp_size(cram_block *b) + int32_t cram_block_get_crc32(cram_block *b) + void * cram_block_get_data(cram_block *b) + + cram_content_type cram_block_get_content_type(cram_block *b) + + void cram_block_set_content_id(cram_block *b, int32_t id) + void cram_block_set_comp_size(cram_block *b, int32_t size) + void cram_block_set_uncomp_size(cram_block *b, int32_t size) + void cram_block_set_crc32(cram_block *b, int32_t crc) + void cram_block_set_data(cram_block *b, void *data) + + int cram_block_append(cram_block *b, void *data, int size) + void cram_block_update_size(cram_block *b) + + # Offset is known as "size" internally, but it can be confusing. + size_t cram_block_get_offset(cram_block *b) + void cram_block_set_offset(cram_block *b, size_t offset) + + # + # Computes the size of a cram block, including the block + # header itself. + # + uint32_t cram_block_size(cram_block *b) + + # + # Renumbers RG numbers in a cram compression header. + # + # CRAM stores RG as the Nth number in the header, rather than a + # string holding the ID: tag. This is smaller in space, but means + # "samtools cat" to join files together that contain single but + # different RG lines needs a way of renumbering them. + # + # The file descriptor is expected to be immediately after the + # cram_container structure (ie before the cram compression header). + # Due to the nature of the CRAM format, this needs to read and write + # the blocks itself. Note that there may be multiple slices within + # the container, meaning multiple compression headers to manipulate. + # Changing RG may change the size of the compression header and + # therefore the length field in the container. Hence we rewrite all + # blocks just incase and also emit the adjusted container. + # + # The current implementation can only cope with renumbering a single + # RG (and only then if it is using HUFFMAN or BETA codecs). In + # theory it *may* be possible to renumber multiple RGs if they use + # HUFFMAN to the CORE block or use an external block unshared by any + # other data series. So we have an API that can be upgraded to + # support this, but do not implement it for now. An example + # implementation of RG as an EXTERNAL block would be to find that + # block and rewrite it, returning the number of blocks consumed. + # + # Returns 0 on success; + # -1 if unable to edit; + # -2 on other errors (eg I/O). + # + int cram_transcode_rg(cram_fd *input, cram_fd *output, + cram_container *c, + int nrg, int *in_rg, int *out_rg) + + # + # Copies the blocks representing the next num_slice slices from a + # container from 'in' to 'out'. It is expected that the file pointer + # is just after the read of the cram_container and cram compression + # header. + # + # Returns 0 on success + # -1 on failure + # + int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice) + + # + #----------------------------------------------------------------------------- + # SAM_hdr + # + + # Tokenises a SAM header into a hash table. + # + # Also extracts a few bits on specific data types, such as @RG lines. + # + # @return + # Returns a SAM_hdr struct on success (free with sam_hdr_free()) + # NULL on failure + # + SAM_hdr *sam_hdr_parse_(const char *hdr, int len) + + + # + #----------------------------------------------------------------------------- + # cram_io basics + # + + # CRAM blocks - the dynamically growable data block. We have code to + # create, update, (un)compress and read/write. + # + # These are derived from the deflate_interlaced.c blocks, but with the + # CRAM extension of content types and IDs. + # + + # Allocates a new cram_block structure with a specified content_type and + # id. + # + # @return + # Returns block pointer on success; + # NULL on failure + # + cram_block *cram_new_block(cram_content_type content_type, + int content_id) + + # Reads a block from a cram file. + # + # @return + # Returns cram_block pointer on success; + # NULL on failure + # + cram_block *cram_read_block(cram_fd *fd) + + # Writes a CRAM block. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_write_block(cram_fd *fd, cram_block *b) + + # Frees a CRAM block, deallocating internal data too. + # + void cram_free_block(cram_block *b) + + # Uncompresses a CRAM block, if compressed. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_uncompress_block(cram_block *b) + + # Compresses a block. + # + # Compresses a block using one of two different zlib strategies. If we only + # want one choice set strat2 to be -1. + # + # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED + # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is + # significantly faster. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, + int method, int level) + + # Containers + # + + # Creates a new container, specifying the maximum number of slices + # and records permitted. + # + # @return + # Returns cram_container ptr on success; + # NULL on failure + # + cram_container *cram_new_container(int nrec, int nslice) + void cram_free_container(cram_container *c) + + # Reads a container header. + # + # @return + # Returns cram_container on success; + # NULL on failure or no container left (fd->err == 0). + # + cram_container *cram_read_container(cram_fd *fd) + + # Writes a container structure. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_write_container(cram_fd *fd, cram_container *h) + + # + # Stores the container structure in dat and returns *size as the + # number of bytes written to dat[]. The input size of dat is also + # held in *size and should be initialised to cram_container_size(c). + # + # Returns 0 on success; + # -1 on failure + # + int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) + + int cram_container_size(cram_container *c) + + # The top-level cram opening, closing and option handling + # + + # Opens a CRAM file for read (mode "rb") or write ("wb"). + # + # The filename may be "-" to indicate stdin or stdout. + # + # @return + # Returns file handle on success; + # NULL on failure. + # + cram_fd *cram_open(const char *filename, const char *mode) + + # Opens an existing stream for reading or writing. + # + # @return + # Returns file handle on success; + # NULL on failure. + # + cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) + + # Closes a CRAM file. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_close(cram_fd *fd) + + # + # Seek within a CRAM file. + # + # Returns 0 on success + # -1 on failure + # + int cram_seek(cram_fd *fd, off_t offset, int whence) + + # + # Flushes a CRAM file. + # Useful for when writing to stdout without wishing to close the stream. + # + # Returns 0 on success + # -1 on failure + # + int cram_flush(cram_fd *fd) + + # Checks for end of file on a cram_fd stream. + # + # @return + # Returns 0 if not at end of file + # 1 if we hit an expected EOF (end of range or EOF block) + # 2 for other EOF (end of stream without EOF block) + # + int cram_eof(cram_fd *fd) + + # Sets options on the cram_fd. + # + # See CRAM_OPT_* definitions in hts.h. + # Use this immediately after opening. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...) + + # Sets options on the cram_fd. + # + # See CRAM_OPT_* definitions in hts.h. + # Use this immediately after opening. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args) + + # + # Attaches a header to a cram_fd. + # + # This should be used when creating a new cram_fd for writing where + # we have an SAM_hdr already constructed (eg from a file we've read + # in). + # + # @return + # Returns 0 on success; + # -1 on failure + # + int cram_set_header(cram_fd *fd, SAM_hdr *hdr) + + # Check if this file has a proper EOF block + # + # @return + # Returns 3 if the file is a version of CRAM that does not contain EOF blocks + # 2 if the file is a stream and thus unseekable + # 1 if the file contains an EOF block + # 0 if the file does not contain an EOF block + # -1 if an error occured whilst reading the file or we could not seek back to where we were + # + # + int cram_check_EOF(cram_fd *fd) + + # As int32_decoded/encode, but from/to blocks instead of cram_fd */ + int int32_put_blk(cram_block *b, int32_t val) + + # Deallocates all storage used by a SAM_hdr struct. + # + # This also decrements the header reference count. If after decrementing + # it is still non-zero then the header is assumed to be in use by another + # caller and the free is not done. + # + # This is a synonym for sam_hdr_dec_ref(). + # + void sam_hdr_free(SAM_hdr *hdr) + + # Returns the current length of the SAM_hdr in text form. + # + # Call sam_hdr_rebuild() first if editing has taken place. + # + int sam_hdr_length(SAM_hdr *hdr) + + # Returns the string form of the SAM_hdr. + # + # Call sam_hdr_rebuild() first if editing has taken place. + # + char *sam_hdr_str(SAM_hdr *hdr) + + # Appends a formatted line to an existing SAM header. + # + # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with + # optional new-line. If it contains more than 1 line then multiple lines + # will be added in order. + # + # Len is the length of the text data, or 0 if unknown (in which case + # it should be null terminated). + # + # @return + # Returns 0 on success; + # -1 on failure + # + + # Add an @PG line. + # + # If we wish complete control over this use sam_hdr_add() directly. This + # function uses that, but attempts to do a lot of tedious house work for + # you too. + # + # - It will generate a suitable ID if the supplied one clashes. + # - It will generate multiple @PG records if we have multiple PG chains. + # + # Call it as per sam_hdr_add() with a series of key,value pairs ending + # in NULL. + # + # @return + # Returns 0 on success; + # -1 on failure + # + int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) + + # + # A function to help with construction of CL tags in @PG records. + # Takes an argc, argv pair and returns a single space-separated string. + # This string should be deallocated by the calling function. + # + # @return + # Returns malloced char * on success; + # NULL on failure + # + char *stringify_argv(int argc, char *argv[]) + + # + # Returns the refs_t structure used by a cram file handle. + # + # This may be used in conjunction with option CRAM_OPT_SHARED_REF to + # share reference memory between multiple file handles. + # + # @return + # Returns NULL if none exists or the file handle is not a CRAM file. + # + refs_t *cram_get_refs(htsFile *fd) + + cdef class HTSFile(object): cdef htsFile *htsfile # pointer to htsFile structure cdef int64_t start_offset # BGZF offset of first record diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 7eea059..4b8d9c0 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -2,8 +2,11 @@ # cython: profile=True # adds doc-strings for sphinx import os +import io from posix.unistd cimport dup +from libc.errno cimport errno +from cpython cimport PyBytes_FromStringAndSize from pysam.libchtslib cimport * @@ -11,15 +14,24 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_ from pysam.libcutils cimport encode_filename, from_string_and_size -__all__ = ["get_verbosity", "set_verbosity"] +from warnings import warn +__all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile'] + +# defines imported from samtools +DEF SEEK_SET = 0 +DEF SEEK_CUR = 1 +DEF SEEK_END = 2 + ######################################################################## ######################################################################## ## Constants ######################################################################## +# maximum genomic coordinace cdef int MAX_POS = 2 << 29 + cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED') @@ -35,6 +47,230 @@ cpdef get_verbosity(): return hts_get_verbosity() +cdef class HFile(object): + cdef hFILE *fp + cdef readonly object name, mode + + def __init__(self, name, mode='r', closedf=True): + self._open(name, mode, closefd=True) + + def __dealloc__(self): + self.close() + + @property + def closed(self): + return self.fp == NULL + + cdef _open(self, name, mode, closefd=True): + self.name = name + self.mode = mode + + mode = force_bytes(mode) + + if isinstance(name, int): + if self.fp != NULL: + name = dup(name) + self.fp = hdopen(name, mode) + else: + name = encode_filename(name) + self.fp = hopen(name, mode) + + if not self.fp: + raise OSError(errno, 'failed to open HFile', self.name) + + def close(self): + if self.fp == NULL: + return + + cdef hFILE *fp = self.fp + self.fp = NULL + + if hclose(fp) != 0: + raise OSError(herrno(self.fp), 'failed to close HFile', self.name) + + def fileno(self): + if self.fp == NULL: + raise OSError('operation on closed HFile') + if isinstance(self.name, int): + return self.name + else: + raise AttributeError('fileno not available') + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.close() + + def __iter__(self): + return self + + def __next__(self): + line = self.readline() + if not line: + raise StopIteration() + return line + + def flush(self): + if self.fp == NULL: + raise OSError('operation on closed HFile') + if hflush(self.fp) != 0: + raise OSError(herrno(self.fp), 'failed to flush HFile', self.name) + + def isatty(self): + if self.fp == NULL: + raise OSError('operation on closed HFile') + return False + + def readable(self): + return self.fp != NULL and 'r' in self.mode + + def read(self, Py_ssize_t size=-1): + if self.fp == NULL: + raise OSError('operation on closed HFile') + + if size == 0: + return b'' + + cdef list parts = [] + cdef bytes part + cdef Py_ssize_t chunk_size, ret, bytes_read = 0 + cdef char *cpart + + while size == -1 or bytes_read < size: + chunk_size = 4096 + if size != -1: + chunk_size = min(chunk_size, size - bytes_read) + + part = PyBytes_FromStringAndSize(NULL, chunk_size) + cpart = part + ret = hread(self.fp, cpart, chunk_size) + + if ret < 0: + OSError(herrno(self.fp), 'failed to read HFile', self.name) + elif not ret: + break + + bytes_read += ret + + if ret < chunk_size: + part = cpart[:ret] + + parts.append(part) + + return b''.join(parts) + + def readall(self): + return self.read() + + def readinto(self, buf): + if self.fp == NULL: + raise OSError('operation on closed HFile') + + size = len(buf) + + if size == 0: + return size + + mv = memoryview(buf) + ret = hread(self.fp, mv, size) + + if ret < 0: + OSError(herrno(self.fp), 'failed to read HFile', self.name) + + return ret + + def readline(self, Py_ssize_t size=-1): + if self.fp == NULL: + raise OSError('operation on closed HFile') + + if size == 0: + return b'' + + cdef list parts = [] + cdef bytes part + cdef Py_ssize_t chunk_size, ret, bytes_read = 0 + cdef char *cpart + + while size == -1 or bytes_read < size: + chunk_size = 4096 + if size != -1: + chunk_size = min(chunk_size, size - bytes_read) + + part = PyBytes_FromStringAndSize(NULL, chunk_size) + cpart = part + + # Python bytes objects allocate an extra byte for a null terminator + ret = hgetln(cpart, chunk_size+1, self.fp) + + if ret < 0: + OSError(herrno(self.fp), 'failed to read HFile', self.name) + elif not ret: + break + + bytes_read += ret + + if ret < chunk_size: + part = cpart[:ret] + cpart = part + + parts.append(part) + + if cpart[ret-1] == b'\n': + break + + return b''.join(parts) + + def readlines(self): + return list(self) + + def seek(self, Py_ssize_t offset, int whence=SEEK_SET): + if self.fp == NULL: + raise OSError('operation on closed HFile') + + cdef Py_ssize_t off = hseek(self.fp, offset, whence) + + if off < 0: + raise OSError(herrno(self.fp), 'seek failed on HFile', self.name) + + return off + + def tell(self): + if self.fp == NULL: + raise OSError('operation on closed HFile') + + ret = htell(self.fp) + + if ret < 0: + raise OSError(herrno(self.fp), 'tell failed on HFile', self.name) + + return ret + + def seekable(self): + return self.fp != NULL + + def truncate(self, size=None): + raise NotImplementedError() + + def writable(self): + return self.fp != NULL and 'w' in self.mode + + def write(self, bytes b): + if self.fp == NULL: + raise OSError('operation on closed HFile') + + got = hwrite(self.fp, b, len(b)) + + if got < 0: + raise OSError(herrno(self.fp), 'write failed on HFile', self.name) + + return got + + def writelines(self, lines): + for line in lines: + self.write(line) + + class CallableValue(object): def __init__(self, value): self.value = value @@ -62,11 +298,38 @@ cdef class HTSFile(object): self.htsfile = NULL self.duplicate_filehandle = True + def close(self): + if self.htsfile: + hts_close(self.htsfile) + self.htsfile = NULL + def __dealloc__(self): if self.htsfile: hts_close(self.htsfile) self.htsfile = NULL + def check_truncation(self, ignore_truncation=False): + """Check if file is truncated.""" + if not self.htsfile: + return + + if self.htsfile.format.compression != bgzf: + return + + cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile) + if not bgzfp: + return + + cdef int ret = bgzf_check_EOF(bgzfp) + if ret < 0: + raise OSError(errno, 'error checking for EOF marker') + elif ret == 0: + msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename) + if ignore_truncation: + warn(msg) + else: + raise OSError(msg) + def __enter__(self): return self @@ -189,12 +452,15 @@ cdef class HTSFile(object): raise OSError('seek not available in streams') cdef int64_t ret - if self.htsfile.format.compression != no_compression: + if self.htsfile.format.compression == bgzf: with nogil: ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET) - else: + elif self.htsfile.format.compression == no_compression: with nogil: ret = hts_useek(self.htsfile, offset, SEEK_SET) + else: + raise NotImplementedError("seek not implemented in files compressed by method {}".format( + self.htsfile.format.compression)) return ret def tell(self): @@ -205,12 +471,19 @@ cdef class HTSFile(object): raise OSError('tell not available in streams') cdef int64_t ret - if self.htsfile.format.compression != no_compression: + if self.htsfile.format.compression == bgzf: with nogil: ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) - else: + elif self.htsfile.format.compression == no_compression: with nogil: ret = hts_utell(self.htsfile) + elif self.htsfile.format.format == cram: + with nogil: + ret = htell(cram_fd_get_fp(self.htsfile.fp.cram)) + else: + raise NotImplementedError("seek not implemented in files compressed by method {}".format( + self.htsfile.format.compression)) + return ret cdef htsFile *_open_htsfile(self) except? NULL: @@ -227,7 +500,7 @@ cdef class HTSFile(object): fd = self.filename else: fd = self.filename.fileno() - + if self.duplicate_filehandle: dup_fd = dup(fd) else: diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd new file mode 100644 index 0000000..5fdc57f --- /dev/null +++ b/pysam/libcsamtools.pxd @@ -0,0 +1,3 @@ +cdef extern from "csamtools_util.h": + + int samtools_main(int argc, char *argv[]) diff --git a/pysam/libcsamtools.pyx b/pysam/libcsamtools.pyx new file mode 100644 index 0000000..cc60ace --- /dev/null +++ b/pysam/libcsamtools.pyx @@ -0,0 +1,2 @@ +def py_samtools(): + pass diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd index 12cd9dd..c986f03 100644 --- a/pysam/libctabix.pxd +++ b/pysam/libctabix.pxd @@ -81,6 +81,10 @@ cdef class asGTF(Parser): pass +cdef class asGFF3(Parser): + pass + + cdef class asBed(Parser): pass diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index 10dc23b..b10c0d0 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -9,7 +9,8 @@ # class TabixFile class wrapping tabix indexed files in bgzf format # # class asTuple Parser class for tuples -# class asGT Parser class for GTF formatted rows +# class asGTF Parser class for GTF formatted rows +# class asGFF3 Parser class for GFF3 formatted rows # class asBed Parser class for Bed formatted rows # class asVCF Parser class for VCF formatted rows # @@ -110,6 +111,42 @@ cdef class asTuple(Parser): return r +cdef class asGFF3(Parser): + '''converts a :term:`tabix row` into a GFF record with the following + fields: + + +----------+----------+-------------------------------+ + |*Column* |*Name* |*Content* | + +----------+----------+-------------------------------+ + |1 |contig |the chromosome name | + +----------+----------+-------------------------------+ + |2 |feature |The feature type | + +----------+----------+-------------------------------+ + |3 |source |The feature source | + +----------+----------+-------------------------------+ + |4 |start |genomic start coordinate | + | | |(0-based) | + +----------+----------+-------------------------------+ + |5 |end |genomic end coordinate | + | | |(0-based) | + +----------+----------+-------------------------------+ + |6 |score |feature score | + +----------+----------+-------------------------------+ + |7 |strand |strand | + +----------+----------+-------------------------------+ + |8 |frame |frame | + +----------+----------+-------------------------------+ + |9 |attributes|the attribute field | + +----------+----------+-------------------------------+ + + ''' + cdef parse(self, char * buffer, int len): + cdef ctabixproxies.GFF3Proxy r + r = ctabixproxies.GFF3Proxy(self.encoding) + r.copy(buffer, len) + return r + + cdef class asGTF(Parser): '''converts a :term:`tabix row` into a GTF record with the following fields: @@ -155,7 +192,7 @@ cdef class asGTF(Parser): r = ctabixproxies.GTFProxy(self.encoding) r.copy(buffer, len) return r - + cdef class asBed(Parser): '''converts a :term:`tabix row` into a bed record @@ -1178,6 +1215,7 @@ __all__ = [ "Tabixfile", "asTuple", "asGTF", + "asGFF3", "asVCF", "asBed", "GZIterator", diff --git a/pysam/libctabixproxies.pxd b/pysam/libctabixproxies.pxd index 5317b81..edea701 100644 --- a/pysam/libctabixproxies.pxd +++ b/pysam/libctabixproxies.pxd @@ -25,19 +25,21 @@ cdef class TupleProxy: cdef copy(self, char * buffer, size_t nbytes, bint reset=*) cdef update(self, char * buffer, size_t nbytes) -cdef class GTFProxy(TupleProxy) : - cdef: - char * _attributes - cdef bint hasOwnAttributes +cdef class NamedTupleProxy(TupleProxy): + pass + +cdef class GTFProxy(NamedTupleProxy): + cdef object attribute_dict cpdef int getMaxFields(self) cpdef int getMinFields(self) - cdef char * getAttributes(self) -cdef class NamedTupleProxy(TupleProxy): + +cdef class GFF3Proxy(GTFProxy): pass + cdef class BedProxy(NamedTupleProxy): cdef: diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx index 9a8a678..dc434e0 100644 --- a/pysam/libctabixproxies.pyx +++ b/pysam/libctabixproxies.pyx @@ -10,18 +10,21 @@ from pysam.libcutils cimport encode_filename, from_string_and_size import collections + cdef char *StrOrEmpty(char * buffer): if buffer == NULL: return "" else: return buffer + cdef int isNew(char * p, char * buffer, size_t nbytes): """return True if `p` is located within `buffer` of size `nbytes` """ if p == NULL: return 0 - return not (buffer <= p < buffer + nbytes) + + return not (buffer <= p <= buffer + nbytes) cdef class TupleProxy: @@ -230,7 +233,7 @@ cdef class TupleProxy: self.nfields = field if self.nfields < self.getMinFields(): raise ValueError( - "parsing error: fewer that %i fields in line: %s" % + "parsing error: fewer than %i fields in line: %s" % (self.getMinFields(), buffer)) def _getindex(self, int index): @@ -268,7 +271,7 @@ cdef class TupleProxy: raise IndexError("list index out of range") if isNew(self.fields[idx], self.data, self.nbytes): - free(self.fields[idx] ) + free(self.fields[idx]) self.is_modified = 1 @@ -350,7 +353,62 @@ def quote(v): return str(v) -cdef class GTFProxy(TupleProxy): +cdef class NamedTupleProxy(TupleProxy): + + map_key2field = {} + + def __setattr__(self, key, value): + '''set attribute.''' + cdef int idx + idx, f = self.map_key2field[key] + if self.nfields < idx: + raise KeyError("field %s not set" % key) + TupleProxy.__setitem__(self, idx, str(value)) + + def __getattr__(self, key): + cdef int idx + idx, f = self.map_key2field[key] + if self.nfields < idx: + raise KeyError("field %s not set" % key) + if f == str: + return force_str(self.fields[idx], + self.encoding) + return f(self.fields[idx]) + + +cdef dot_or_float(v): + if v == "" or v == b".": + return None + else: + try: + return int(v) + except ValueError: + return float(v) + + +cdef dot_or_int(v): + if v == "" or v == b".": + return None + else: + return int(v) + + +cdef dot_or_str(v): + if v == "" or v == b".": + return None + else: + return force_str(v) + + +cdef int from1based(v): + return atoi(v) - 1 + + +cdef str to1based(int v): + return str(v + 1) + + +cdef class GTFProxy(NamedTupleProxy): '''Proxy class for access to GTF fields. This class represents a GTF entry for fast read-access. @@ -361,18 +419,29 @@ cdef class GTFProxy(TupleProxy): The only exception is the attributes field when set from a dictionary - this field will manage its own memory. + ''' + separator = "; " + # first value is field index, the tuple contains conversion + # functions for getting (converting internal string representation + # to pythonic value) and setting (converting pythonic value to + # interval string representation) + map_key2field = { + 'contig' : (0, (str, str)), + 'source' : (1, (dot_or_str, str)), + 'feature': (2, (dot_or_str, str)), + 'start' : (3, (from1based, to1based)), + 'end' : (4, (int, int)), + 'score' : (5, (dot_or_float, toDot)), + 'strand' : (6, (dot_or_str, str)), + 'frame' : (7, (dot_or_int, toDot)), + 'attributes': (8, (str, str))} + def __cinit__(self): # automatically calls TupleProxy.__cinit__ - self.hasOwnAttributes = False - self._attributes = NULL - - def __dealloc__(self): - # automatically calls TupleProxy.__dealloc__ - if self.hasOwnAttributes: - free(self._attributes) - + self.attribute_dict = None + cpdef int getMinFields(self): '''return minimum number of fields.''' return 9 @@ -381,182 +450,18 @@ cdef class GTFProxy(TupleProxy): '''return max number of fields.''' return 9 - property contig: - '''contig of feature.''' - def __get__(self): - return self._getindex(0) - def __set__(self, value): - self._setindex(0, value) - - property source: - '''feature source.''' - def __get__(self): - return self._getindex(1) - def __set__(self, value): - if value is None: - value = "." - self._setindex(1, value) - - property feature: - '''feature name.''' - def __get__(self): - return self._getindex(2) - def __set__(self, value): - if value is None: - value = "." - self._setindex(2, value) - - property start: - '''feature start (in 0-based open/closed coordinates).''' - def __get__(self ): - return int( self._getindex(3)) - 1 - def __set__(self, value ): - self._setindex(3, str(value+1)) - - property end: - '''feature end (in 0-based open/closed coordinates).''' - def __get__(self): - return int(self._getindex(4)) - def __set__(self, value): - self._setindex(4, str(value)) - - property score: - '''feature score.''' - def __get__(self): - v = self._getindex(5) - if v == "" or v[0] == '.': - return None - else: - return float(v) - - def __set__(self, value): - if value is None: - value = "." - self._setindex(5, str(value)) - - property strand: - '''feature strand.''' - def __get__(self): - return self._getindex(6) - def __set__(self, value ): - if value is None: - value = "." - self._setindex(6, value) - - property frame: - '''feature frame.''' - def __get__(self): - v = self._getindex(7) - if v == "" or v[0] == '.': - return v - else: - return int(v) - - def __set__(self, value): - if value is None: - value = "." - self._setindex(7, str(value)) - - property attributes: - '''feature attributes (as a string).''' - def __get__(self): - if self.hasOwnAttributes: - return force_str(self._attributes) - else: - return force_str(self._getindex(8)) - def __set__( self, value): - if self.hasOwnAttributes: - free(self._attributes) - self._attributes = NULL - self.hasOwnAttributes = False - self._setindex(8, value) - - cdef char * getAttributes(self): - '''return pointer to attributes.''' - cdef char * attributes - if self.hasOwnAttributes: - attributes = self._attributes - else: - attributes = self.fields[8] - if attributes == NULL: - raise KeyError("no attributes defined GTF entry") - return attributes - def asDict(self): """parse attributes - return as dict """ - - # remove comments - attributes = self.attributes - - # separate into fields - # Fields might contain a ";", for example in ENSEMBL GTF file - # for mouse, v78: - # ...; transcript_name "TXNRD2;-001"; .... - # The current heuristic is to split on a semicolon followed by a - # space, see also http://mblab.wustl.edu/GTF22.html - - # Remove white space to prevent a last empty field. - fields = [x.strip() for x in attributes.strip().split("; ")] - - result = collections.OrderedDict() - - for f in fields: - - # strip semicolon (GTF files without a space after the last semicolon) - if f.endswith(";"): - f = f[:-1] - - # split at most once in order to avoid separating - # multi-word values - d = [x.strip() for x in f.split(" ", 1)] - - n,v = d[0], d[1] - if len(d) > 2: - v = d[1:] - - if v[0] == '"' and v[-1] == '"': - v = v[1:-1] - else: - ## try to convert to a value - try: - v = float(v) - v = int(v) - except ValueError: - pass - except TypeError: - pass - - result[n] = v - - return result + return collections.OrderedDict(self.attribute_iterator()) def fromDict(self, d): '''set attributes from a dictionary.''' - cdef char * p - cdef int l - - # clean up if this field is set twice - if self.hasOwnAttributes: - free(self._attributes) - - aa = [] - for k,v in d.items(): - if isinstance(v, str): - aa.append( '%s "%s"' % (k,v) ) - else: - aa.append( '%s %s' % (k,str(v)) ) - - a = force_bytes("; ".join(aa) + ";") - p = a - l = len(a) - self._attributes = calloc(l + 1, sizeof(char)) - if self._attributes == NULL: - raise ValueError("out of memory") - memcpy(self._attributes, p, l) - - self.hasOwnAttributes = True - self.is_modified = True + self.attribute_dict = None + attribute_string = force_bytes( + self.attribute_dict2string(d), + self.encoding) + self._setindex(8, attribute_string) def __str__(self): cdef char * cpy @@ -565,9 +470,9 @@ cdef class GTFProxy(TupleProxy): if self.is_modified: return "\t".join( (self.contig, - self.source, - self.feature, - str(self.start+1), + toDot(self.source), + toDot(self.feature), + str(self.start + 1), str(self.end), toDot(self.score), toDot(self.strand), @@ -589,73 +494,26 @@ cdef class GTFProxy(TupleProxy): def keys(self): '''return a list of attributes defined in this entry.''' - r = self.attributes - return [x.strip().split(" ")[0] - # separator is ';' followed by space - for x in r.split("; ") if x.strip() != ''] + if not self.attribute_dict: + self.attribute_dict = self.attribute_string2dict( + self.attributes) + return self.attribute_dict.keys() def __getitem__(self, key): return self.__getattr__(key) - def __getattr__(self, item): - """Generic lookup of attribute from GFF/GTF attributes - Only called if there *isn't* an attribute with this name - """ - cdef char * start - cdef char * query - cdef char * cpy - cdef char * end - cdef int l - - # - # important to use the getAttributes function. - # Using the self.attributes property to access - # the attributes caused a hard-to-trace bug - # in which fields in the attribute string were - # set to 0. - # Running through valgrind complained that - # memory was accessed in the memory field - # that has been released. It is not clear - # why this happened and might be a cython bug - # (Version 0.16). The valgrind warnings - # disappeard after accessing the C data structures - # directly and so did the bug. - cdef char * attributes = self.getAttributes() - if attributes == NULL: - raise KeyError("key %s not found, no attributes" % item) - - # add space in order to make sure - # to not pick up a field that is a prefix of another field - r = force_bytes(item + " ") - query = r - start = strstr(attributes, query) - - if start == NULL: - raise AttributeError("'GTFProxy' has no attribute '%s'" % item) - - start += strlen(query) - # skip gaps before - while start[0] == ' ': - start += 1 - - if start[0] == '"': - start += 1 - end = start - while end[0] != '\0' and end[0] != '"': - end += 1 - l = end - start - result = force_str(PyBytes_FromStringAndSize(start, l), - self.encoding) - return result - else: - return force_str(start, self.encoding) - def setAttribute(self, name, value): - '''convenience method to set an attribute.''' - r = self.asDict() - r[name] = value - self.fromDict(r) - + '''convenience method to set an attribute. + ''' + if not self.attribute_dict: + self.attribute_dict = self.attribute_string2dict( + self.attributes) + self.attribute_dict[name] = value + + def attribute_string2dict(self, s): + return collections.OrderedDict( + self.attribute_string2iterator(s)) + def __cmp__(self, other): return (self.contig, self.strand, self.start) < \ (other.contig, other.strand, other.start) @@ -676,29 +534,148 @@ cdef class GTFProxy(TupleProxy): err_msg = "op {0} isn't implemented yet".format(op) raise NotImplementedError(err_msg) + def dict2attribute_string(self, d): + """convert dictionary to attribute string in GTF format. -cdef class NamedTupleProxy(TupleProxy): + """ + aa = [] + for k, v in d.items(): + if isinstance(v, str): + aa.append('{} "{}"'.format(k, v)) + else: + aa.append("{} {}".format(k, str(v))) - map_key2field = {} + return self.separator.join(aa) + ";" + + def attribute_string2iterator(self, s): + """convert attribute string in GTF format to records + and iterate over key, value pairs. + """ + + # remove comments + attributes = force_str(s, encoding=self.encoding) + + # separate into fields + # Fields might contain a ";", for example in ENSEMBL GTF file + # for mouse, v78: + # ...; transcript_name "TXNRD2;-001"; .... + # The current heuristic is to split on a semicolon followed by a + # space, see also http://mblab.wustl.edu/GTF22.html + + # Remove white space to prevent a last empty field. + fields = [x.strip() for x in attributes.strip().split("; ")] + for f in fields: + + # strip semicolon (GTF files without a space after the last semicolon) + if f.endswith(";"): + f = f[:-1] + + # split at most once in order to avoid separating + # multi-word values + d = [x.strip() for x in f.split(" ", 1)] + + n, v = d[0], d[1] + if len(d) > 2: + v = d[1:] + + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + else: + ## try to convert to a value + try: + v = float(v) + v = int(v) + except ValueError: + pass + except TypeError: + pass + + yield n, v + + def __getattr__(self, key): + """Generic lookup of attribute from GFF/GTF attributes + """ + + # Only called if there *isn't* an attribute with this name + cdef int idx + idx, f = self.map_key2field.get(key, (-1, None)) + if idx >= 0: + # deal with known attributes (fields 0-8) + if idx == 8: + # flush attributes if requested + if self.is_modified and self.attribute_dict is not None: + s = self.dict2attribute_string(self.attribute_dict) + TupleProxy._setindex(self, idx, s) + self.attribute_dict = None + return s + + if f[0] == str: + return force_str(self.fields[idx], + self.encoding) + else: + return f[0](self.fields[idx]) + else: + # deal with generic attributes (gene_id, ...) + if self.attribute_dict is None: + self.attribute_dict = self.attribute_string2dict( + self.attributes) + return self.attribute_dict[key] def __setattr__(self, key, value): '''set attribute.''' - cdef int idx - idx, f = self.map_key2field[key] - if self.nfields < idx: - raise KeyError("field %s not set" % key) - TupleProxy.__setitem__(self, idx, str(value)) - def __getattr__(self, key): + # Note that __setattr__ is called before properties, so __setattr__ and + # properties don't mix well. This is different from __getattr__ which is + # called after any properties have been resolved. cdef int idx - idx, f = self.map_key2field[key] - if self.nfields < idx: - raise KeyError("field %s not set" % key) - if f == str: - return force_str(self.fields[idx], - self.encoding) - return f(self.fields[idx]) + idx, f = self.map_key2field.get(key, (-1, None)) + + if idx >= 0: + if value is None: + s = "." + elif f[1] == str: + s = force_bytes(value, + self.encoding) + else: + s = str(f[1](value)) + TupleProxy._setindex(self, idx, s) + else: + if self.attribute_dict is None: + self.attribute_dict = self.attribute_string2dict( + self.attributes) + self.attribute_dict[key] = value + self.is_modified = True + + +cdef class GFF3Proxy(GTFProxy): + + def dict2attribute_string(self, d): + """convert dictionary to attribute string.""" + return ";".join(["{}={}".format(k, v) for k, v in d.items()]) + + def attribute_string2iterator(self, s): + """convert attribute string in GFF3 format to records + and iterate over key, value pairs. + """ + + for f in (x.strip() for x in s.split(";")): + if not f: + continue + key, value = f.split("=", 1) + value = value.strip() + + ## try to convert to a value + try: + value = float(value) + value = int(value) + except ValueError: + pass + except TypeError: + pass + + yield key.strip(), value + cdef class BedProxy(NamedTupleProxy): '''Proxy class for access to Bed fields. @@ -762,7 +739,7 @@ cdef class BedProxy(NamedTupleProxy): self.nfields = save_fields return retval - def __setattr__(self, key, value ): + def __setattr__(self, key, value): '''set attribute.''' if key == "start": self.start = value @@ -771,7 +748,8 @@ cdef class BedProxy(NamedTupleProxy): cdef int idx idx, f = self.map_key2field[key] - TupleProxy._setindex(self, idx, str(value) ) + TupleProxy._setindex(self, idx, str(value)) + cdef class VCFProxy(NamedTupleProxy): '''Proxy class for access to VCF fields. diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd index 81e544a..479d337 100644 --- a/pysam/libcutils.pxd +++ b/pysam/libcutils.pxd @@ -28,11 +28,11 @@ cdef from_string_and_size(const char *s, size_t length) cdef extern from "pysam_util.h": - int samtools_main(int argc, char *argv[]) - int bcftools_main(int argc, char *argv[]) void pysam_set_stderr(int fd) void pysam_unset_stderr() void pysam_set_stdout(int fd) void pysam_set_stdout_fn(const char *) void pysam_unset_stdout() void set_optind(int) + extern int samtools_main(int argc, char *argv[]) + extern int bcftools_main(int argc, char *argv[]) diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index 80bd9e4..2b90420 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -16,6 +16,9 @@ from libc.stdio cimport fprintf, stderr, fflush from libc.stdio cimport stdout as c_stdout from posix.fcntl cimport open as c_open, O_WRONLY +from libcbcftools cimport bcftools_main +from libcsamtools cimport samtools_main + ##################################################################### # hard-coded constants cdef int MAX_POS = 2 << 29 @@ -234,16 +237,22 @@ def _pysam_dispatch(collection, method, args=None, catch_stdout=True, + is_usage=False, save_stdout=None): '''call ``method`` in samtools/bcftools providing arguments in args. + By default, stdout is redirected to a temporary file using the patched + C sources except for a few commands that have an explicit output option + (typically: -o). In these commands (such as samtools view), this explicit + option is used. If *is_usage* is True, then these explicit output options + will not be used. + Catching of stdout can be turned off by setting *catch_stdout* to False. - ''' if method == "index": - if not os.path.exists(args[0]): + if args and not os.path.exists(args[0]): raise IOError("No such file or directory: '%s'" % args[0]) if args is None: @@ -267,17 +276,16 @@ def _pysam_dispatch(collection, pysam_set_stdout(stdout_h) elif catch_stdout: stdout_h, stdout_f = tempfile.mkstemp() - MAP_STDOUT_OPTIONS = { - "samtools": { - "view": "-o {}", - "mpileup": "-o {}", - "depad": "-o {}", - "calmd": "", # uses pysam_stdout_fn - }, + "samtools": { + "view": "-o {}", + "mpileup": "-o {}", + "depad": "-o {}", + "calmd": "", # uses pysam_stdout_fn + }, "bcftools": {} } - + stdout_option = None if collection == "bcftools": # in bcftools, most methods accept -o, the exceptions @@ -289,7 +297,7 @@ def _pysam_dispatch(collection, if not(method == "view" and "-c" in args): stdout_option = MAP_STDOUT_OPTIONS[collection][method] - if stdout_option is not None: + if stdout_option is not None and not is_usage: os.close(stdout_h) pysam_set_stdout_fn(force_bytes(stdout_f)) args.extend(stdout_option.format(stdout_f).split(" ")) diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c index 94717c8..5940a35 100644 --- a/pysam/pysam_util.c +++ b/pysam/pysam_util.c @@ -2,8 +2,10 @@ #include #include #include -#include "bam.h" -#include "bam_endian.h" + +/* #include "bam.h" */ +/* #include "bam_endian.h" */ + #include "htslib/khash.h" #include "htslib/ksort.h" #include "htslib/knetfile.h" diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h index a30808f..8627d96 100644 --- a/pysam/pysam_util.h +++ b/pysam/pysam_util.h @@ -34,4 +34,8 @@ int pysam_dispatch(int argc, char *argv[]); void set_optind(int); +extern int samtools_main(int argc, char *argv[]); + +extern int bcftools_main(int argc, char *argv[]); + #endif diff --git a/pysam/samfile_util.c b/pysam/samfile_util.c index f5724ae..b6917ed 100644 --- a/pysam/samfile_util.c +++ b/pysam/samfile_util.c @@ -1,8 +1,6 @@ #include "samfile_util.h" #include "htslib/sam.h" -#include "kprobaln.h" - // taken from bam_md.c // replace bam1_{qual,seq,cigar} with bam_get_{qual,seq,cigar} // bam1_seqi -> bam_seqi @@ -14,175 +12,5 @@ char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; -int bam_cap_mapQ(bam1_t *b, char *ref, int thres) -{ - uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b); - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; - int i, x, y, mm, q, len, clip_l, clip_q; - double t; - if (thres < 0) thres = 40; // set the default - mm = q = len = clip_l = clip_q = 0; - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int z = y + j; - int c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; - if (ref[x+j] == 0) break; // out of boundary - if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous - ++len; - if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch - ++mm; - q += qual[z] > 33? 33 : qual[z]; - } - } - } - if (j < l) break; - x += l; y += l; len += l; - } else if (op == BAM_CDEL) { - for (j = 0; j < l; ++j) - if (ref[x+j] == 0) break; - if (j < l) break; - x += l; - } else if (op == BAM_CSOFT_CLIP) { - for (j = 0; j < l; ++j) clip_q += qual[y+j]; - clip_l += l; - y += l; - } else if (op == BAM_CHARD_CLIP) { - clip_q += 13 * l; - clip_l += l; - } else if (op == BAM_CINS) y += l; - else if (op == BAM_CREF_SKIP) x += l; - } - for (i = 0, t = 1; i < mm; ++i) - t *= (double)len / (i+1); - t = q - 4.343 * log(t) + clip_q / 5.; - if (t > thres) return -1; - if (t < 0) t = 0; - t = sqrt((thres - t) / thres) * thres; -// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q); - return (int)(t + .499); -} - - -int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) -{ - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; - kpa_par_t conf = kpa_par_def; - uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b); - if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing - // test if BQ or ZQ is present - if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; - if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; - if (bq && redo_baq) - { - bam_aux_del(b, bq-1); - bq = 0; - } - if (bq && zq) { // remove the ZQ tag - bam_aux_del(b, zq-1); - zq = 0; - } - if (bq || zq) { - if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing - if (bq && apply_baq) { // then convert BQ to ZQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64); - *(bq - 3) = 'Z'; - } else if (zq && !apply_baq) { // then convert ZQ to BQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] += (int)zq[i] - 64; - *(zq - 3) = 'B'; - } - return 0; - } - // find the start and end of the alignment - x = c->pos, y = 0, yb = ye = xb = xe = -1; - for (k = 0; k < c->n_cigar; ++k) { - int op, l; - op = cigar[k]&0xf; l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (yb < 0) yb = y; - if (xb < 0) xb = x; - ye = y + l; xe = x + l; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip - } - // set bandwidth and the start and the end - bw = 7; - if (abs((xe - xb) - (ye - yb)) > bw) - bw = abs((xe - xb) - (ye - yb)) + 3; - conf.bw = bw; - xb -= yb + bw/2; if (xb < 0) xb = 0; - xe += c->l_qseq - ye + bw/2; - if (xe - xb - c->l_qseq > bw) - xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2; - { // glocal - uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq; - int *state; - bq = calloc(c->l_qseq + 1, 1); - memcpy(bq, qual, c->l_qseq); - s = calloc(c->l_qseq, 1); - for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam_seqi(seq, i)]; - r = calloc(xe - xb, 1); - for (i = xb; i < xe; ++i) { - if (ref[i] == 0) { xe = i; break; } - r[i-xb] = bam_nt16_nt4_table[seq_nt16_table[(int)ref[i]]]; - } - state = calloc(c->l_qseq, sizeof(int)); - q = calloc(c->l_qseq, 1); - kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); - if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) { - if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; - else bq[i] = bq[i] < q[i]? bq[i] : q[i]; - } - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ - } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!) - uint8_t *left, *rght; - left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) - bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; - for (left[y] = bq[y], i = y + 1; i < y + l; ++i) - left[i] = bq[i] > left[i-1]? bq[i] : left[i-1]; - for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) - rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1]; - for (i = y; i < y + l; ++i) - bq[i] = left[i] < rght[i]? left[i] : rght[i]; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ - free(left); free(rght); - } - if (apply_baq) { - for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual - bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); - } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); - free(bq); free(s); free(r); free(q); free(state); - } - return 0; -} - -int bam_prob_realn(bam1_t *b, const char *ref) -{ - return bam_prob_realn_core(b, ref, 1); -} diff --git a/pysam/samfile_util.h b/pysam/samfile_util.h index dd3e27a..94ce096 100644 --- a/pysam/samfile_util.h +++ b/pysam/samfile_util.h @@ -3,8 +3,5 @@ #include "htslib/sam.h" -int bam_cap_mapQ(bam1_t *b, char *ref, int thres); -int bam_prob_realn(bam1_t *b, const char *ref); - #endif diff --git a/pysam/tabix_util.c b/pysam/tabix_util.c index bff140e..319808a 100644 --- a/pysam/tabix_util.c +++ b/pysam/tabix_util.c @@ -1,6 +1,7 @@ #include #include #include +#include #if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700) /* diff --git a/pysam/utils.py b/pysam/utils.py index 5c045df..239f5db 100644 --- a/pysam/utils.py +++ b/pysam/utils.py @@ -92,7 +92,14 @@ class PysamDispatcher(object): def usage(self): '''return the samtools usage information for this command''' - retval, stderr, stdout = csamtools._samtools_dispatch( - self.dispatch) - return stderr + retval, stderr, stdout = _pysam_dispatch( + self.collection, + self.dispatch, + is_usage=True, + catch_stdout=True) + # some tools write usage to stderr, such as mpileup + if stderr: + return stderr + else: + return stdout diff --git a/pysam/version.py b/pysam/version.py index facb3bb..ac832cf 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,9 +1,10 @@ # pysam versioning information +__version__ = "0.11.2.2" -__version__ = "0.10.0" +# TODO: upgrade number +__samtools_version__ = "1.4.1" -__samtools_version__ = "1.3.1" +# TODO: upgrade code and number +__bcftools_version__ = "1.4.1" -__bcftools_version__ = "1.3.1" - -__htslib_version__ = "1.3.2" +__htslib_version__ = "1.4.1" diff --git a/samtools/bam.h b/samtools/bam.h index e928ce4..108987c 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */ @copyright Genome Research Ltd. */ -#define BAM_VERSION "1.3.1" +#define BAM_VERSION "1.4.1" #include #include diff --git a/samtools/bam2bcf.c b/samtools/bam2bcf.c index 85ce307..a824d5a 100644 --- a/samtools/bam2bcf.c +++ b/samtools/bam2bcf.c @@ -29,11 +29,11 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include #include "bam2bcf.h" -#include "errmod.h" extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); diff --git a/samtools/bam2bcf.c.pysam.c b/samtools/bam2bcf.c.pysam.c index 6938ec0..3e3e01c 100644 --- a/samtools/bam2bcf.c.pysam.c +++ b/samtools/bam2bcf.c.pysam.c @@ -31,11 +31,11 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include #include "bam2bcf.h" -#include "errmod.h" extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); diff --git a/samtools/bam2bcf.h b/samtools/bam2bcf.h index 22c67cc..54e5faa 100644 --- a/samtools/bam2bcf.h +++ b/samtools/bam2bcf.h @@ -27,8 +27,8 @@ DEALINGS IN THE SOFTWARE. */ #define BAM2BCF_H #include +#include #include -#include "errmod.h" /** * A simplified version of Mann-Whitney U-test is calculated diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c index 5b353fc..9749d5b 100644 --- a/samtools/bam2bcf_indel.c +++ b/samtools/bam2bcf_indel.c @@ -28,9 +28,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/hts.h" #include "htslib/sam.h" #include "bam2bcf.h" -#include "kprobaln.h" #include "htslib/khash.h" KHASH_SET_INIT_STR(rg) @@ -359,7 +359,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla bca->indelreg = 0; for (t = 0; t < n_types; ++t) { int l, ir; - kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; + probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; apf1.bw = apf2.bw = abs(types[t]) + 3; // compute indelreg if (types[t] == 0) ir = 0; @@ -412,14 +412,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; } - sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below if (l > 255) l = 255; score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; if (sc > 5) { - sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); l = (int)(100. * sc / (qend - qbeg) + .499); if (l > 255) l = 255; score2[K*n_types + t] = sc<<8 | l; @@ -439,10 +439,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } free(ref2); free(query); { // compute indelQ - int *sc, tmp, *sumq; - sc = alloca(n_types * sizeof(int)); - sumq = alloca(n_types * sizeof(int)); - memset(sumq, 0, sizeof(int) * n_types); + int sc_a[16], sumq_a[16]; + int tmp, *sc = sc_a, *sumq = sumq_a; + if (n_types > 16) { + sc = (int *)malloc(n_types * sizeof(int)); + sumq = (int *)malloc(n_types * sizeof(int)); + } + memset(sumq, 0, n_types * sizeof(int)); for (s = K = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; @@ -523,6 +526,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); } } + + if (sc != sc_a) free(sc); + if (sumq != sumq_a) free(sumq); } free(score1); free(score2); // free diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c index 21cbb03..fcbc90f 100644 --- a/samtools/bam2bcf_indel.c.pysam.c +++ b/samtools/bam2bcf_indel.c.pysam.c @@ -30,9 +30,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/hts.h" #include "htslib/sam.h" #include "bam2bcf.h" -#include "kprobaln.h" #include "htslib/khash.h" KHASH_SET_INIT_STR(rg) @@ -361,7 +361,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla bca->indelreg = 0; for (t = 0; t < n_types; ++t) { int l, ir; - kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; + probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; apf1.bw = apf2.bw = abs(types[t]) + 3; // compute indelreg if (types[t] == 0) ir = 0; @@ -414,14 +414,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; } - sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below if (l > 255) l = 255; score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; if (sc > 5) { - sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); + sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); l = (int)(100. * sc / (qend - qbeg) + .499); if (l > 255) l = 255; score2[K*n_types + t] = sc<<8 | l; @@ -441,10 +441,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } free(ref2); free(query); { // compute indelQ - int *sc, tmp, *sumq; - sc = alloca(n_types * sizeof(int)); - sumq = alloca(n_types * sizeof(int)); - memset(sumq, 0, sizeof(int) * n_types); + int sc_a[16], sumq_a[16]; + int tmp, *sc = sc_a, *sumq = sumq_a; + if (n_types > 16) { + sc = (int *)malloc(n_types * sizeof(int)); + sumq = (int *)malloc(n_types * sizeof(int)); + } + memset(sumq, 0, n_types * sizeof(int)); for (s = K = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; @@ -525,6 +528,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); } } + + if (sc != sc_a) free(sc); + if (sumq != sumq_a) free(sumq); } free(score1); free(score2); // free diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index 21220f1..b732e8e 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -80,13 +80,13 @@ static int usage() { fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); fprintf(stderr, " -b list of positions or regions\n"); fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(stderr, " -l read length threshold (ignore reads shorter than )\n"); + fprintf(stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); fprintf(stderr, " -d/-m maximum coverage depth [8000]\n"); // the htslib's default - fprintf(stderr, " -q base quality threshold\n"); - fprintf(stderr, " -Q mapping quality threshold\n"); + fprintf(stderr, " -q base quality threshold [0]\n"); + fprintf(stderr, " -Q mapping quality threshold [0]\n"); fprintf(stderr, " -r region\n"); - sam_global_opt_help(stderr, "-.--."); + sam_global_opt_help(stderr, "-.--.-"); fprintf(stderr, "\n"); fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); @@ -99,7 +99,7 @@ static int usage() { int main_depth(int argc, char *argv[]) { - int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; + int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; const bam_pileup1_t **plp; char *reg = 0; // specified region @@ -112,7 +112,7 @@ int main_depth(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), { NULL, 0, NULL, 0 } }; @@ -149,7 +149,7 @@ int main_depth(int argc, char *argv[]) else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input - beg = 0; end = INT_MAX; // set the default region + reg_tid = 0; beg = 0; end = INT_MAX; // set the default region for (i = 0; i < n; ++i) { int rf; data[i] = calloc(1, sizeof(aux_t)); @@ -199,6 +199,7 @@ int main_depth(int argc, char *argv[]) if (reg) { beg = data[0]->iter->beg; // and to the parsed region coordinates end = data[0]->iter->end; + reg_tid = data[0]->iter->tid; } // the core multi-pileup loop @@ -210,12 +211,12 @@ int main_depth(int argc, char *argv[]) while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? - if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip if (all) { while (tid > last_tid) { - if (last_tid >= 0 && all > 1 && !reg) { - // Deal with remainder or entirety of last tid + if (last_tid >= 0 && !reg) { + // Deal with remainder or entirety of last tid. while (++last_pos < h->target_len[last_tid]) { + // Horribly inefficient, but the bed API is an obfuscated black box. if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); @@ -226,6 +227,8 @@ int main_depth(int argc, char *argv[]) } last_tid++; last_pos = -1; + if (all < 2) + break; } // Deal with missing portion of current tid @@ -242,6 +245,7 @@ int main_depth(int argc, char *argv[]) last_tid = tid; last_pos = pos; } + if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; @@ -260,7 +264,11 @@ int main_depth(int argc, char *argv[]) if (all) { // Handle terminating region - while (last_tid < h->n_targets) { + if (last_tid < 0 && reg && all > 1) { + last_tid = reg_tid; + last_pos = beg-1; + } + while (last_tid >= 0 && last_tid < h->n_targets) { while (++last_pos < h->target_len[last_tid]) { if (last_pos >= end) break; if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index 9d9dc40..4d9110b 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -82,13 +82,13 @@ static int usage() { fprintf(pysam_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); fprintf(pysam_stderr, " -b list of positions or regions\n"); fprintf(pysam_stderr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(pysam_stderr, " -l read length threshold (ignore reads shorter than )\n"); + fprintf(pysam_stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); fprintf(pysam_stderr, " -d/-m maximum coverage depth [8000]\n"); // the htslib's default - fprintf(pysam_stderr, " -q base quality threshold\n"); - fprintf(pysam_stderr, " -Q mapping quality threshold\n"); + fprintf(pysam_stderr, " -q base quality threshold [0]\n"); + fprintf(pysam_stderr, " -Q mapping quality threshold [0]\n"); fprintf(pysam_stderr, " -r region\n"); - sam_global_opt_help(pysam_stderr, "-.--."); + sam_global_opt_help(pysam_stderr, "-.--.-"); fprintf(pysam_stderr, "\n"); fprintf(pysam_stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); @@ -101,7 +101,7 @@ static int usage() { int main_depth(int argc, char *argv[]) { - int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; + int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; const bam_pileup1_t **plp; char *reg = 0; // specified region @@ -114,7 +114,7 @@ int main_depth(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), { NULL, 0, NULL, 0 } }; @@ -151,7 +151,7 @@ int main_depth(int argc, char *argv[]) else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input - beg = 0; end = INT_MAX; // set the default region + reg_tid = 0; beg = 0; end = INT_MAX; // set the default region for (i = 0; i < n; ++i) { int rf; data[i] = calloc(1, sizeof(aux_t)); @@ -201,6 +201,7 @@ int main_depth(int argc, char *argv[]) if (reg) { beg = data[0]->iter->beg; // and to the parsed region coordinates end = data[0]->iter->end; + reg_tid = data[0]->iter->tid; } // the core multi-pileup loop @@ -212,12 +213,12 @@ int main_depth(int argc, char *argv[]) while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? - if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip if (all) { while (tid > last_tid) { - if (last_tid >= 0 && all > 1 && !reg) { - // Deal with remainder or entirety of last tid + if (last_tid >= 0 && !reg) { + // Deal with remainder or entirety of last tid. while (++last_pos < h->target_len[last_tid]) { + // Horribly inefficient, but the bed API is an obfuscated black box. if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1); @@ -228,6 +229,8 @@ int main_depth(int argc, char *argv[]) } last_tid++; last_pos = -1; + if (all < 2) + break; } // Deal with missing portion of current tid @@ -244,6 +247,7 @@ int main_depth(int argc, char *argv[]) last_tid = tid; last_pos = pos; } + if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", pos+1); // a customized fprintf(pysam_stdout, ) would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; @@ -262,7 +266,11 @@ int main_depth(int argc, char *argv[]) if (all) { // Handle terminating region - while (last_tid < h->n_targets) { + if (last_tid < 0 && reg && all > 1) { + last_tid = reg_tid; + last_pos = beg-1; + } + while (last_tid >= 0 && last_tid < h->n_targets) { while (++last_pos < h->target_len[last_tid]) { if (last_pos >= end) break; if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) diff --git a/samtools/bam_addrprg.c b/samtools/bam_addrprg.c index f7bbfab..99a198d 100644 --- a/samtools/bam_addrprg.c +++ b/samtools/bam_addrprg.c @@ -1,6 +1,6 @@ /* bam_addrprg.c -- samtools command to add or replace readgroups. - Copyright (c) 2013, 2015 Genome Research Limited. + Copyright (c) 2013, 2015, 2016 Genome Research Limited. Author: Martin O. Pollard @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include "samtools.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include #include @@ -48,6 +49,7 @@ struct parsed_opts { char* rg_line; rg_mode mode; sam_global_args ga; + htsThreadPool p; }; struct state; @@ -69,6 +71,7 @@ static void cleanup_opts(parsed_opts_t* opts) free(opts->rg_id); free(opts->output_name); free(opts->input_name); + if (opts->p.pool) hts_tpool_destroy(opts->p.pool); sam_global_args_free(&opts->ga); free(opts); } @@ -131,6 +134,19 @@ static char* basic_unescape(const char* in) return tmp; } +// Malloc a string containing [s,slim) or to the end of s if slim is NULL. +// If lenp is non-NULL, stores the length of the resulting string there. +static char *dup_substring(const char *s, const char *slim, size_t *lenp) +{ + size_t len = slim? (slim - s) : strlen(s); + char *ns = malloc(len+1); + if (ns == NULL) return NULL; + memcpy(ns, s, len); + ns[len] = '\0'; + if (lenp) *lenp = len; + return ns; +} + // These are to be replaced by samtools header parser // Extracts the first @RG line from a string. static char* get_rg_line(const char* text, size_t* last) @@ -143,37 +159,17 @@ static char* get_rg_line(const char* text, size_t* last) rg++;//skip initial \n } // duplicate the line for return - char* line; - char* end = strchr(rg, '\n'); - if (end) { - line = strndup(rg,(end-rg)); - *last = end - rg; - } else { - line = strdup(rg); - *last = strlen(rg); - } - return line; + return dup_substring(rg, strchr(rg, '\n'), last); } // Given a @RG line return the id -static char* get_rg_id(const char* input) +static char* get_rg_id(const char *line) { - assert(input!=NULL); - char* line = strdup(input); - char *next = line; - char* token = strsep(&next, "\t"); - token = strsep(&next,"\t"); // skip first token it should always be "@RG" - while (next != NULL) { - char* key = strsep(&token,":"); - if (!strcmp(key,"ID")) { - char* retval = strdup(token); - free(line); - return retval; - } - token = strsep(&next,"\t"); - } - free(line); - return NULL; + const char *id = strstr(line, "\tID:"); + if (! id) return NULL; + + id += 4; + return dup_substring(id, strchr(id, '\t'), NULL); } // Confirms the existance of an RG line with a given ID in a bam header @@ -181,9 +177,8 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) { assert( hdr != NULL && rgid != NULL ); - char *ptr, *start; + const char *ptr = hdr->text; bool found = false; - start = ptr = strndup(hdr->text, hdr->l_text); while (ptr != NULL && *ptr != '\0' && found == false ) { size_t end = 0; char* line = get_rg_line(ptr, &end); @@ -196,16 +191,14 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) free(line); ptr += end; } - free(start); return found; } static char* get_first_rgid( const bam_hdr_t *hdr ) { assert( hdr != NULL ); - char *ptr, *start; + const char *ptr = hdr->text; char* found = NULL; - start = ptr = strndup(hdr->text, hdr->l_text); while (ptr != NULL && *ptr != '\0' && found == NULL ) { size_t end = 0; char* line = get_rg_line(ptr, &end); @@ -215,7 +208,6 @@ static char* get_first_rgid( const bam_hdr_t *hdr ) free(line); ptr += end; } - free(start); return found; } @@ -230,7 +222,7 @@ static void usage(FILE *fp) " -r STRING @RG line text\n" " -R STRING ID of @RG line in existing header to use\n" ); - sam_global_opt_help(fp, "..O.."); + sam_global_opt_help(fp, "..O..@"); } static bool parse_args(int argc, char** argv, parsed_opts_t** opts) @@ -249,12 +241,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) retval->mode = overwrite_all; sam_global_args_init(&retval->ga); static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'), { NULL, 0, NULL, 0 } }; kstring_t rg_line = {0,0,NULL}; - while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) { + while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) { switch (n) { case 'r': // Are we adding to existing rg line? @@ -328,6 +320,13 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) } retval->input_name = strdup(argv[optind+0]); + if (retval->ga.nthreads > 0) { + if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) { + fprintf(stderr, "Error creating thread pool\n"); + return false; + } + } + *opts = retval; return true; } @@ -369,7 +368,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { // Open files retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in); if (retval->input_file == NULL) { - fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name); + print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name); return false; } retval->input_header = sam_hdr_read(retval->input_file); @@ -378,10 +377,15 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out); if (retval->output_file == NULL) { - print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name); + print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name); return false; } + if (opts->p.pool) { + hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p); + hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p); + } + if (opts->rg_line) { // Append new RG line to header. // Check does not already exist @@ -466,13 +470,13 @@ int main_addreplacerg(int argc, char** argv) if (!readgroupise(state)) goto error; - cleanup_opts(opts); cleanup_state(state); + cleanup_opts(opts); return EXIT_SUCCESS; error: - cleanup_opts(opts); cleanup_state(state); + cleanup_opts(opts); return EXIT_FAILURE; } diff --git a/samtools/bam_addrprg.c.pysam.c b/samtools/bam_addrprg.c.pysam.c index 2ddd1b1..56986dd 100644 --- a/samtools/bam_addrprg.c.pysam.c +++ b/samtools/bam_addrprg.c.pysam.c @@ -2,7 +2,7 @@ /* bam_addrprg.c -- samtools command to add or replace readgroups. - Copyright (c) 2013, 2015 Genome Research Limited. + Copyright (c) 2013, 2015, 2016 Genome Research Limited. Author: Martin O. Pollard @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include "samtools.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include #include @@ -50,6 +51,7 @@ struct parsed_opts { char* rg_line; rg_mode mode; sam_global_args ga; + htsThreadPool p; }; struct state; @@ -71,6 +73,7 @@ static void cleanup_opts(parsed_opts_t* opts) free(opts->rg_id); free(opts->output_name); free(opts->input_name); + if (opts->p.pool) hts_tpool_destroy(opts->p.pool); sam_global_args_free(&opts->ga); free(opts); } @@ -133,6 +136,19 @@ static char* basic_unescape(const char* in) return tmp; } +// Malloc a string containing [s,slim) or to the end of s if slim is NULL. +// If lenp is non-NULL, stores the length of the resulting string there. +static char *dup_substring(const char *s, const char *slim, size_t *lenp) +{ + size_t len = slim? (slim - s) : strlen(s); + char *ns = malloc(len+1); + if (ns == NULL) return NULL; + memcpy(ns, s, len); + ns[len] = '\0'; + if (lenp) *lenp = len; + return ns; +} + // These are to be replaced by samtools header parser // Extracts the first @RG line from a string. static char* get_rg_line(const char* text, size_t* last) @@ -145,37 +161,17 @@ static char* get_rg_line(const char* text, size_t* last) rg++;//skip initial \n } // duplicate the line for return - char* line; - char* end = strchr(rg, '\n'); - if (end) { - line = strndup(rg,(end-rg)); - *last = end - rg; - } else { - line = strdup(rg); - *last = strlen(rg); - } - return line; + return dup_substring(rg, strchr(rg, '\n'), last); } // Given a @RG line return the id -static char* get_rg_id(const char* input) +static char* get_rg_id(const char *line) { - assert(input!=NULL); - char* line = strdup(input); - char *next = line; - char* token = strsep(&next, "\t"); - token = strsep(&next,"\t"); // skip first token it should always be "@RG" - while (next != NULL) { - char* key = strsep(&token,":"); - if (!strcmp(key,"ID")) { - char* retval = strdup(token); - free(line); - return retval; - } - token = strsep(&next,"\t"); - } - free(line); - return NULL; + const char *id = strstr(line, "\tID:"); + if (! id) return NULL; + + id += 4; + return dup_substring(id, strchr(id, '\t'), NULL); } // Confirms the existance of an RG line with a given ID in a bam header @@ -183,9 +179,8 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) { assert( hdr != NULL && rgid != NULL ); - char *ptr, *start; + const char *ptr = hdr->text; bool found = false; - start = ptr = strndup(hdr->text, hdr->l_text); while (ptr != NULL && *ptr != '\0' && found == false ) { size_t end = 0; char* line = get_rg_line(ptr, &end); @@ -198,16 +193,14 @@ static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) free(line); ptr += end; } - free(start); return found; } static char* get_first_rgid( const bam_hdr_t *hdr ) { assert( hdr != NULL ); - char *ptr, *start; + const char *ptr = hdr->text; char* found = NULL; - start = ptr = strndup(hdr->text, hdr->l_text); while (ptr != NULL && *ptr != '\0' && found == NULL ) { size_t end = 0; char* line = get_rg_line(ptr, &end); @@ -217,7 +210,6 @@ static char* get_first_rgid( const bam_hdr_t *hdr ) free(line); ptr += end; } - free(start); return found; } @@ -232,7 +224,7 @@ static void usage(FILE *fp) " -r STRING @RG line text\n" " -R STRING ID of @RG line in existing header to use\n" ); - sam_global_opt_help(fp, "..O.."); + sam_global_opt_help(fp, "..O..@"); } static bool parse_args(int argc, char** argv, parsed_opts_t** opts) @@ -251,12 +243,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) retval->mode = overwrite_all; sam_global_args_init(&retval->ga); static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'), { NULL, 0, NULL, 0 } }; kstring_t rg_line = {0,0,NULL}; - while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) { + while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) { switch (n) { case 'r': // Are we adding to existing rg line? @@ -330,6 +322,13 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) } retval->input_name = strdup(argv[optind+0]); + if (retval->ga.nthreads > 0) { + if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) { + fprintf(pysam_stderr, "Error creating thread pool\n"); + return false; + } + } + *opts = retval; return true; } @@ -371,7 +370,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { // Open files retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in); if (retval->input_file == NULL) { - fprintf(pysam_stderr, "[init] Could not open input file: %s\n", opts->input_name); + print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name); return false; } retval->input_header = sam_hdr_read(retval->input_file); @@ -380,10 +379,15 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out); if (retval->output_file == NULL) { - print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name); + print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name); return false; } + if (opts->p.pool) { + hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p); + hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p); + } + if (opts->rg_line) { // Append new RG line to header. // Check does not already exist @@ -468,13 +472,13 @@ int main_addreplacerg(int argc, char** argv) if (!readgroupise(state)) goto error; - cleanup_opts(opts); cleanup_state(state); + cleanup_opts(opts); return EXIT_SUCCESS; error: - cleanup_opts(opts); cleanup_state(state); + cleanup_opts(opts); return EXIT_FAILURE; } diff --git a/samtools/bam_cat.c b/samtools/bam_cat.c index 5c303d1..95498ec 100644 --- a/samtools/bam_cat.c +++ b/samtools/bam_cat.c @@ -40,6 +40,7 @@ Illumina. #include #include #include +#include #include "htslib/bgzf.h" #include "htslib/sam.h" @@ -468,7 +469,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) } if (in->block_offset < in->block_length) { - if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; if (bgzf_flush(fp) != 0) goto write_fail; } @@ -531,10 +532,12 @@ int main_cat(int argc, char *argv[]) { bam_hdr_t *h = 0; char *outfn = 0; + char **infns = NULL; // files to concatenate + int infns_size = 0; int c, ret = 0; samFile *in; - while ((c = getopt(argc, argv, "h:o:")) >= 0) { + while ((c = getopt(argc, argv, "h:o:b:")) >= 0) { switch (c) { case 'h': { samFile *fph = sam_open(optarg, "r"); @@ -553,29 +556,61 @@ int main_cat(int argc, char *argv[]) break; } case 'o': outfn = strdup(optarg); break; + case 'b': { + // add file names in "optarg" to the list + // of files to concatenate + int nfns; + char **fns_read = hts_readlines(optarg, &nfns); + if (fns_read) { + infns = realloc(infns, (infns_size + nfns) * sizeof(char*)); + if (infns == NULL) { ret = 1; goto end; } + memcpy(infns+infns_size, fns_read, nfns * sizeof(char*)); + infns_size += nfns; + free(fns_read); + } else { + print_error("cat", "Invalid file list \"%s\"", optarg); + ret = 1; + } + break; + } } } - if (argc - optind < 1) { - fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); + + // Append files specified in argv to the list. + int nargv_fns = argc - optind; + if (nargv_fns > 0) { + infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*)); + if (infns == NULL) { ret = 1; goto end; } + memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*)); + } + + // Require at least one input file + if (infns_size + nargv_fns == 0) { + fprintf(stderr, "Usage: samtools cat [options] [... ]\n"); + fprintf(stderr, " samtools cat [options] [... ]\n\n"); + fprintf(stderr, "Concatenate BAM or CRAM files, first those in , then those\non the command line.\n\n"); + fprintf(stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); + fprintf(stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); + fprintf(stderr, " -o FILE output BAM/CRAM\n"); return 1; } - in = sam_open(argv[optind], "r"); + in = sam_open(infns[0], "r"); if (!in) { - print_error_errno("cat", "failed to open file '%s'", argv[optind]); + print_error_errno("cat", "failed to open file '%s'", infns[0]); return 1; } switch (hts_get_format(in)->format) { case bam: sam_close(in); - if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) + if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) ret = 1; break; case cram: sam_close(in); - if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) + if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) ret = 1; break; @@ -584,7 +619,16 @@ int main_cat(int argc, char *argv[]) fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); return 1; } + + end: + if (infns_size > 0) { + int i; + for (i=0; i #include #include +#include #include "htslib/bgzf.h" #include "htslib/sam.h" @@ -470,7 +471,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) } if (in->block_offset < in->block_length) { - if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; if (bgzf_flush(fp) != 0) goto write_fail; } @@ -533,10 +534,12 @@ int main_cat(int argc, char *argv[]) { bam_hdr_t *h = 0; char *outfn = 0; + char **infns = NULL; // files to concatenate + int infns_size = 0; int c, ret = 0; samFile *in; - while ((c = getopt(argc, argv, "h:o:")) >= 0) { + while ((c = getopt(argc, argv, "h:o:b:")) >= 0) { switch (c) { case 'h': { samFile *fph = sam_open(optarg, "r"); @@ -555,29 +558,61 @@ int main_cat(int argc, char *argv[]) break; } case 'o': outfn = strdup(optarg); break; + case 'b': { + // add file names in "optarg" to the list + // of files to concatenate + int nfns; + char **fns_read = hts_readlines(optarg, &nfns); + if (fns_read) { + infns = realloc(infns, (infns_size + nfns) * sizeof(char*)); + if (infns == NULL) { ret = 1; goto end; } + memcpy(infns+infns_size, fns_read, nfns * sizeof(char*)); + infns_size += nfns; + free(fns_read); + } else { + print_error("cat", "Invalid file list \"%s\"", optarg); + ret = 1; + } + break; + } } } - if (argc - optind < 1) { - fprintf(pysam_stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); + + // Append files specified in argv to the list. + int nargv_fns = argc - optind; + if (nargv_fns > 0) { + infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*)); + if (infns == NULL) { ret = 1; goto end; } + memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*)); + } + + // Require at least one input file + if (infns_size + nargv_fns == 0) { + fprintf(pysam_stderr, "Usage: samtools cat [options] [... ]\n"); + fprintf(pysam_stderr, " samtools cat [options] [... ]\n\n"); + fprintf(pysam_stderr, "Concatenate BAM or CRAM files, first those in , then those\non the command line.\n\n"); + fprintf(pysam_stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); + fprintf(pysam_stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); + fprintf(pysam_stderr, " -o FILE output BAM/CRAM\n"); return 1; } - in = sam_open(argv[optind], "r"); + in = sam_open(infns[0], "r"); if (!in) { - print_error_errno("cat", "failed to open file '%s'", argv[optind]); + print_error_errno("cat", "failed to open file '%s'", infns[0]); return 1; } switch (hts_get_format(in)->format) { case bam: sam_close(in); - if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) + if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) ret = 1; break; case cram: sam_close(in); - if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) + if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) ret = 1; break; @@ -586,7 +621,16 @@ int main_cat(int argc, char *argv[]) fprintf(pysam_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); return 1; } + + end: + if (infns_size > 0) { + int i; + for (i=0; i= 0) + while ((c = getopt(argc, argv, "bcm:@:")) >= 0) switch (c) { case 'b': csi = 0; break; case 'c': csi = 1; break; case 'm': csi = 1; min_shift = atoi(optarg); break; + case '@': n_threads = atoi(optarg); break; default: index_usage(stderr); return 1; @@ -70,18 +73,32 @@ int bam_index(int argc, char *argv[]) return 1; } - ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0); - if (ret != 0) { - if (ret == -2) - print_error_errno("index", "failed to open \"%s\"", argv[optind]); - else if (ret == -3) - print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]); + ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads); + switch (ret) { + case 0: + return 0; + + case -2: + print_error_errno("index", "failed to open \"%s\"", argv[optind]); + break; + + case -3: + print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]); + break; + + case -4: + if (argv[optind+1]) + print_error("index", "failed to create or write index \"%s\"", argv[optind+1]); else - print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]); - return EXIT_FAILURE; + print_error("index", "failed to create or write index"); + break; + + default: + print_error_errno("index", "failed to create index for \"%s\"", argv[optind]); + break; } - return 0; + return EXIT_FAILURE; } int bam_idxstats(int argc, char *argv[]) @@ -95,15 +112,20 @@ int bam_idxstats(int argc, char *argv[]) return 1; } fp = sam_open(argv[1], "r"); - if (fp == NULL) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; } + if (fp == NULL) { + print_error_errno("idxstats", "failed to open \"%s\"", argv[1]); + return 1; + } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(stderr, "[%s] failed to read header for '%s'.\n", - __func__, argv[1]); + print_error("idxstats", "failed to read header for \"%s\"", argv[1]); return 1; } idx = sam_index_load(fp, argv[1]); - if (idx == NULL) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; } + if (idx == NULL) { + print_error("idxstats", "fail to load index for \"%s\"", argv[1]); + return 1; + } int i; for (i = 0; i < header->n_targets; ++i) { diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c index 6c0efdc..a91ee76 100644 --- a/samtools/bam_index.c.pysam.c +++ b/samtools/bam_index.c.pysam.c @@ -48,20 +48,23 @@ static void index_usage(FILE *fp) "Options:\n" " -b Generate BAI-format index for BAM files [default]\n" " -c Generate CSI-format index for BAM files\n" -" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n", BAM_LIDX_SHIFT); +" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n" +" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT); } int bam_index(int argc, char *argv[]) { int csi = 0; int min_shift = BAM_LIDX_SHIFT; + int n_threads = 0; int c, ret; - while ((c = getopt(argc, argv, "bcm:")) >= 0) + while ((c = getopt(argc, argv, "bcm:@:")) >= 0) switch (c) { case 'b': csi = 0; break; case 'c': csi = 1; break; case 'm': csi = 1; min_shift = atoi(optarg); break; + case '@': n_threads = atoi(optarg); break; default: index_usage(pysam_stderr); return 1; @@ -72,18 +75,32 @@ int bam_index(int argc, char *argv[]) return 1; } - ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0); - if (ret != 0) { - if (ret == -2) - print_error_errno("index", "failed to open \"%s\"", argv[optind]); - else if (ret == -3) - print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]); + ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads); + switch (ret) { + case 0: + return 0; + + case -2: + print_error_errno("index", "failed to open \"%s\"", argv[optind]); + break; + + case -3: + print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]); + break; + + case -4: + if (argv[optind+1]) + print_error("index", "failed to create or write index \"%s\"", argv[optind+1]); else - print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]); - return EXIT_FAILURE; + print_error("index", "failed to create or write index"); + break; + + default: + print_error_errno("index", "failed to create index for \"%s\"", argv[optind]); + break; } - return 0; + return EXIT_FAILURE; } int bam_idxstats(int argc, char *argv[]) @@ -97,15 +114,20 @@ int bam_idxstats(int argc, char *argv[]) return 1; } fp = sam_open(argv[1], "r"); - if (fp == NULL) { fprintf(pysam_stderr, "[%s] fail to open BAM.\n", __func__); return 1; } + if (fp == NULL) { + print_error_errno("idxstats", "failed to open \"%s\"", argv[1]); + return 1; + } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(pysam_stderr, "[%s] failed to read header for '%s'.\n", - __func__, argv[1]); + print_error("idxstats", "failed to read header for \"%s\"", argv[1]); return 1; } idx = sam_index_load(fp, argv[1]); - if (idx == NULL) { fprintf(pysam_stderr, "[%s] fail to load the index.\n", __func__); return 1; } + if (idx == NULL) { + print_error("idxstats", "fail to load index for \"%s\"", argv[1]); + return 1; + } int i; for (i = 0; i < header->n_targets; ++i) { diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c index 5b13b2e..75c2f51 100644 --- a/samtools/bam_mate.c +++ b/samtools/bam_mate.c @@ -1,6 +1,6 @@ /* bam_mate.c -- fix mate pairing information and clean up flags. - Copyright (C) 2009, 2011-2016 Genome Research Ltd. + Copyright (C) 2009, 2011-2017 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "htslib/kstring.h" #include "htslib/sam.h" @@ -155,9 +156,30 @@ static bool plausibly_properly_paired(bam1_t* a, bam1_t* b) return false; } -static void sync_mq(bam1_t* src, bam1_t* dest) +// Returns 0 on success, -1 on failure. +static int bam_format_cigar(const bam1_t* b, kstring_t* str) +{ + // An empty cigar is a special case return "*" rather than "" + if (b->core.n_cigar == 0) { + return (kputc('*', str) == EOF) ? -1 : 0; + } + + const uint32_t *cigar = bam_get_cigar(b); + uint32_t i; + + for (i = 0; i < b->core.n_cigar; ++i) { + if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1; + if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1; + } + + return 0; +} + +// Returns 0 on success, -1 on failure. +static int sync_mq_mc(bam1_t* src, bam1_t* dest) { if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped + // Copy Mate Mapping Quality uint32_t mq = src->core.qual; uint8_t* data; if ((data = bam_aux_get(dest,"MQ")) != NULL) { @@ -166,17 +188,34 @@ static void sync_mq(bam1_t* src, bam1_t* dest) bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq); } + // Copy mate cigar if either read is mapped + if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) { + uint8_t* data_mc; + if ((data_mc = bam_aux_get(dest,"MC")) != NULL) { + bam_aux_del(dest, data_mc); + } + + // Convert cigar to string + kstring_t mc = { 0, 0, NULL }; + if (bam_format_cigar(src, &mc) < 0) return -1; + + bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc)); + free(mc.s); + } + return 0; } -// copy flags -static void sync_mate(bam1_t* a, bam1_t* b) +// Copy flags. +// Returns 0 on success, -1 on failure. +static int sync_mate(bam1_t* a, bam1_t* b) { sync_unmapped_pos_inner(a,b); sync_unmapped_pos_inner(b,a); sync_mate_inner(a,b); sync_mate_inner(b,a); - sync_mq(a,b); - sync_mq(b,a); + if (sync_mq_mc(a,b) < 0) return -1; + if (sync_mq_mc(b,a) < 0) return -1; + return 0; } // currently, this function ONLY works if each read has one hit @@ -239,7 +278,7 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; cur->core.flag |= BAM_FPAIRED; - sync_mate(pre, cur); + if (sync_mate(pre, cur)) goto fail; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE @@ -324,7 +363,7 @@ void usage(FILE* where) " -p Disable FR proper pair check\n" " -c Add template cigar ct tag\n"); - sam_global_opt_help(where, "-.O.."); + sam_global_opt_help(where, "-.O..@"); fprintf(where, "\n" @@ -335,18 +374,19 @@ void usage(FILE* where) int bam_mating(int argc, char *argv[]) { + htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL; int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[3] = {'w', 'b', 0}; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { NULL, 0, NULL, 0 } }; // parse args if (argc == 1) { usage(stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; case 'p': proper_pair_check = 0; break; @@ -369,6 +409,15 @@ int bam_mating(int argc, char *argv[]) goto fail; } + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(stderr, "Error creating thread pool\n"); + goto fail; + } + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + } + // run res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); @@ -379,12 +428,14 @@ int bam_mating(int argc, char *argv[]) res = 1; } + if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(&ga); return res; fail: if (in) sam_close(in); if (out) sam_close(out); + if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(&ga); return 1; } diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index a416d07..a03de96 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -2,7 +2,7 @@ /* bam_mate.c -- fix mate pairing information and clean up flags. - Copyright (C) 2009, 2011-2016 Genome Research Ltd. + Copyright (C) 2009, 2011-2017 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "htslib/kstring.h" #include "htslib/sam.h" @@ -157,9 +158,30 @@ static bool plausibly_properly_paired(bam1_t* a, bam1_t* b) return false; } -static void sync_mq(bam1_t* src, bam1_t* dest) +// Returns 0 on success, -1 on failure. +static int bam_format_cigar(const bam1_t* b, kstring_t* str) +{ + // An empty cigar is a special case return "*" rather than "" + if (b->core.n_cigar == 0) { + return (kputc('*', str) == EOF) ? -1 : 0; + } + + const uint32_t *cigar = bam_get_cigar(b); + uint32_t i; + + for (i = 0; i < b->core.n_cigar; ++i) { + if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1; + if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1; + } + + return 0; +} + +// Returns 0 on success, -1 on failure. +static int sync_mq_mc(bam1_t* src, bam1_t* dest) { if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped + // Copy Mate Mapping Quality uint32_t mq = src->core.qual; uint8_t* data; if ((data = bam_aux_get(dest,"MQ")) != NULL) { @@ -168,17 +190,34 @@ static void sync_mq(bam1_t* src, bam1_t* dest) bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq); } + // Copy mate cigar if either read is mapped + if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) { + uint8_t* data_mc; + if ((data_mc = bam_aux_get(dest,"MC")) != NULL) { + bam_aux_del(dest, data_mc); + } + + // Convert cigar to string + kstring_t mc = { 0, 0, NULL }; + if (bam_format_cigar(src, &mc) < 0) return -1; + + bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc)); + free(mc.s); + } + return 0; } -// copy flags -static void sync_mate(bam1_t* a, bam1_t* b) +// Copy flags. +// Returns 0 on success, -1 on failure. +static int sync_mate(bam1_t* a, bam1_t* b) { sync_unmapped_pos_inner(a,b); sync_unmapped_pos_inner(b,a); sync_mate_inner(a,b); sync_mate_inner(b,a); - sync_mq(a,b); - sync_mq(b,a); + if (sync_mq_mc(a,b) < 0) return -1; + if (sync_mq_mc(b,a) < 0) return -1; + return 0; } // currently, this function ONLY works if each read has one hit @@ -241,7 +280,7 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; cur->core.flag |= BAM_FPAIRED; - sync_mate(pre, cur); + if (sync_mate(pre, cur)) goto fail; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE @@ -326,7 +365,7 @@ void usage(FILE* where) " -p Disable FR proper pair check\n" " -c Add template cigar ct tag\n"); - sam_global_opt_help(where, "-.O.."); + sam_global_opt_help(where, "-.O..@"); fprintf(where, "\n" @@ -337,18 +376,19 @@ void usage(FILE* where) int bam_mating(int argc, char *argv[]) { + htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL; int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[3] = {'w', 'b', 0}; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { NULL, 0, NULL, 0 } }; // parse args if (argc == 1) { usage(pysam_stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; case 'p': proper_pair_check = 0; break; @@ -371,6 +411,15 @@ int bam_mating(int argc, char *argv[]) goto fail; } + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(pysam_stderr, "Error creating thread pool\n"); + goto fail; + } + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + } + // run res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); @@ -381,12 +430,14 @@ int bam_mating(int argc, char *argv[]) res = 1; } + if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(&ga); return res; fail: if (in) sam_close(in); if (out) sam_close(out); + if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(&ga); return 1; } diff --git a/samtools/bam_md.c b/samtools/bam_md.c index 71206cd..f095030 100644 --- a/samtools/bam_md.c +++ b/samtools/bam_md.c @@ -25,15 +25,15 @@ DEALINGS IN THE SOFTWARE. */ #include -#include +#include +#include #include #include #include -#include #include "htslib/faidx.h" #include "htslib/sam.h" #include "htslib/kstring.h" -#include "kprobaln.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "samtools.h" @@ -161,178 +161,6 @@ void bam_fillmd1(bam1_t *b, char *ref, int flag) bam_fillmd1_core(b, ref, INT_MAX, flag, 0); } -int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres) -{ - uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b); - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; - int i, x, y, mm, q, len, clip_l, clip_q; - double t; - if (thres < 0) thres = 40; // set the default - mm = q = len = clip_l = clip_q = 0; - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int c1, c2, z = y + j; - if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds - c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; - if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous - ++len; - if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch - ++mm; - q += qual[z] > 33? 33 : qual[z]; - } - } - } - if (j < l) break; - x += l; y += l; len += l; - } else if (op == BAM_CDEL) { - for (j = 0; j < l; ++j) - if (x+j >= ref_len || ref[x+j] == '\0') break; - if (j < l) break; - x += l; - } else if (op == BAM_CSOFT_CLIP) { - for (j = 0; j < l; ++j) clip_q += qual[y+j]; - clip_l += l; - y += l; - } else if (op == BAM_CHARD_CLIP) { - clip_q += 13 * l; - clip_l += l; - } else if (op == BAM_CINS) y += l; - else if (op == BAM_CREF_SKIP) x += l; - } - for (i = 0, t = 1; i < mm; ++i) - t *= (double)len / (i+1); - t = q - 4.343 * log(t) + clip_q / 5.; - if (t > thres) return -1; - if (t < 0) t = 0; - t = sqrt((thres - t) / thres) * thres; -// fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q); - return (int)(t + .499); -} - -int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag) -{ - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; - kpa_par_t conf = kpa_par_def; - uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b); - if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1) - return -1; // do nothing - - // test if BQ or ZQ is present - if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; - if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; - if (bq && redo_baq) - { - bam_aux_del(b, bq-1); - bq = 0; - } - if (bq && zq) { // remove the ZQ tag - bam_aux_del(b, zq-1); - zq = 0; - } - if (bq || zq) { - if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing - if (bq && apply_baq) { // then convert BQ to ZQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64); - *(bq - 3) = 'Z'; - } else if (zq && !apply_baq) { // then convert ZQ to BQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] += (int)zq[i] - 64; - *(zq - 3) = 'B'; - } - return 0; - } - // find the start and end of the alignment - x = c->pos, y = 0, yb = ye = xb = xe = -1; - for (k = 0; k < c->n_cigar; ++k) { - int op, l; - op = cigar[k]&0xf; l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (yb < 0) yb = y; - if (xb < 0) xb = x; - ye = y + l; xe = x + l; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip - } - // set bandwidth and the start and the end - bw = 7; - if (abs((xe - xb) - (ye - yb)) > bw) - bw = abs((xe - xb) - (ye - yb)) + 3; - conf.bw = bw; - xb -= yb + bw/2; if (xb < 0) xb = 0; - xe += c->l_qseq - ye + bw/2; - if (xe - xb - c->l_qseq > bw) - xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2; - { // glocal - uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq; - int *state; - bq = calloc(c->l_qseq + 1, 1); - memcpy(bq, qual, c->l_qseq); - s = calloc(c->l_qseq, 1); - for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)]; - r = calloc(xe - xb, 1); - for (i = xb; i < xe; ++i) { - if (i >= ref_len || ref[i] == '\0') { xe = i; break; } - r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]]; - } - state = calloc(c->l_qseq, sizeof(int)); - q = calloc(c->l_qseq, 1); - kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); - if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) { - if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; - else bq[i] = bq[i] < q[i]? bq[i] : q[i]; - } - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ - } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!) - uint8_t *left, *rght; - left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) - bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; - for (left[y] = bq[y], i = y + 1; i < y + l; ++i) - left[i] = bq[i] > left[i-1]? bq[i] : left[i-1]; - for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) - rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1]; - for (i = y; i < y + l; ++i) - bq[i] = left[i] < rght[i]? left[i] : rght[i]; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ - free(left); free(rght); - } - if (apply_baq) { - for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual - bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); - } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); - free(bq); free(s); free(r); free(q); free(state); - } - return 0; -} - -int bam_prob_realn(bam1_t *b, const char *ref) -{ - return bam_prob_realn_core(b, ref, INT_MAX, 1); -} - int calmd_usage() { fprintf(stderr, "Usage: samtools calmd [-eubrAES] \n" @@ -345,13 +173,14 @@ int calmd_usage() { " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n" " -E extended BAQ for better sensitivity but lower specificity\n"); - sam_global_opt_help(stderr, "-...."); + sam_global_opt_help(stderr, "-....@"); return 1; } int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; + htsThreadPool p = {NULL, 0}; samFile *fp = NULL, *fpout = NULL; bam_hdr_t *header = NULL; faidx_t *fai = NULL; @@ -360,14 +189,14 @@ int bam_fillmd(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), { NULL, 0, NULL, 0 } }; flt_flag = UPDATE_NM | UPDATE_MD; is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; strcpy(mode_w, "w"); - while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) { switch (c) { case 'r': is_realn = 1; break; case 'e': flt_flag |= USE_EQUAL; break; @@ -415,6 +244,15 @@ int bam_fillmd(int argc, char *argv[]) goto fail; } + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(stderr, "Error creating thread pool\n"); + goto fail; + } + hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p); + } + ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference; fai = fai_load(ref_file); @@ -440,9 +278,9 @@ int bam_fillmd(int argc, char *argv[]) if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } - if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag); + if (is_realn) sam_prob_realn(b, ref, len, baq_flag); if (capQ > 10) { - int q = bam_cap_mapQ(b, ref, len, capQ); + int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); @@ -466,6 +304,8 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] error when closing output file\n"); return 1; } + if (p.pool) hts_tpool_destroy(p.pool); + return 0; fail: @@ -475,5 +315,7 @@ int bam_fillmd(int argc, char *argv[]) if (fai) fai_destroy(fai); if (fp) sam_close(fp); if (fpout) sam_close(fpout); + if (p.pool) hts_tpool_destroy(p.pool); + return 1; } diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c index d00c01d..5e4cdb5 100644 --- a/samtools/bam_md.c.pysam.c +++ b/samtools/bam_md.c.pysam.c @@ -27,15 +27,15 @@ DEALINGS IN THE SOFTWARE. */ #include -#include +#include +#include #include #include #include -#include #include "htslib/faidx.h" #include "htslib/sam.h" #include "htslib/kstring.h" -#include "kprobaln.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "samtools.h" @@ -163,178 +163,6 @@ void bam_fillmd1(bam1_t *b, char *ref, int flag) bam_fillmd1_core(b, ref, INT_MAX, flag, 0); } -int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres) -{ - uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b); - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; - int i, x, y, mm, q, len, clip_l, clip_q; - double t; - if (thres < 0) thres = 40; // set the default - mm = q = len = clip_l = clip_q = 0; - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int c1, c2, z = y + j; - if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds - c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; - if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous - ++len; - if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch - ++mm; - q += qual[z] > 33? 33 : qual[z]; - } - } - } - if (j < l) break; - x += l; y += l; len += l; - } else if (op == BAM_CDEL) { - for (j = 0; j < l; ++j) - if (x+j >= ref_len || ref[x+j] == '\0') break; - if (j < l) break; - x += l; - } else if (op == BAM_CSOFT_CLIP) { - for (j = 0; j < l; ++j) clip_q += qual[y+j]; - clip_l += l; - y += l; - } else if (op == BAM_CHARD_CLIP) { - clip_q += 13 * l; - clip_l += l; - } else if (op == BAM_CINS) y += l; - else if (op == BAM_CREF_SKIP) x += l; - } - for (i = 0, t = 1; i < mm; ++i) - t *= (double)len / (i+1); - t = q - 4.343 * log(t) + clip_q / 5.; - if (t > thres) return -1; - if (t < 0) t = 0; - t = sqrt((thres - t) / thres) * thres; -// fprintf(pysam_stderr, "%s %lf %d\n", bam_get_qname(b), t, q); - return (int)(t + .499); -} - -int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag) -{ - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; - kpa_par_t conf = kpa_par_def; - uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b); - if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1) - return -1; // do nothing - - // test if BQ or ZQ is present - if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; - if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; - if (bq && redo_baq) - { - bam_aux_del(b, bq-1); - bq = 0; - } - if (bq && zq) { // remove the ZQ tag - bam_aux_del(b, zq-1); - zq = 0; - } - if (bq || zq) { - if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing - if (bq && apply_baq) { // then convert BQ to ZQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64); - *(bq - 3) = 'Z'; - } else if (zq && !apply_baq) { // then convert ZQ to BQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] += (int)zq[i] - 64; - *(zq - 3) = 'B'; - } - return 0; - } - // find the start and end of the alignment - x = c->pos, y = 0, yb = ye = xb = xe = -1; - for (k = 0; k < c->n_cigar; ++k) { - int op, l; - op = cigar[k]&0xf; l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (yb < 0) yb = y; - if (xb < 0) xb = x; - ye = y + l; xe = x + l; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip - } - // set bandwidth and the start and the end - bw = 7; - if (abs((xe - xb) - (ye - yb)) > bw) - bw = abs((xe - xb) - (ye - yb)) + 3; - conf.bw = bw; - xb -= yb + bw/2; if (xb < 0) xb = 0; - xe += c->l_qseq - ye + bw/2; - if (xe - xb - c->l_qseq > bw) - xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2; - { // glocal - uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq; - int *state; - bq = calloc(c->l_qseq + 1, 1); - memcpy(bq, qual, c->l_qseq); - s = calloc(c->l_qseq, 1); - for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)]; - r = calloc(xe - xb, 1); - for (i = xb; i < xe; ++i) { - if (i >= ref_len || ref[i] == '\0') { xe = i; break; } - r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]]; - } - state = calloc(c->l_qseq, sizeof(int)); - q = calloc(c->l_qseq, 1); - kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); - if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) { - if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; - else bq[i] = bq[i] < q[i]? bq[i] : q[i]; - } - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ - } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!) - uint8_t *left, *rght; - left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) - bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; - for (left[y] = bq[y], i = y + 1; i < y + l; ++i) - left[i] = bq[i] > left[i-1]? bq[i] : left[i-1]; - for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) - rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1]; - for (i = y; i < y + l; ++i) - bq[i] = left[i] < rght[i]? left[i] : rght[i]; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ - free(left); free(rght); - } - if (apply_baq) { - for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual - bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); - } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); - free(bq); free(s); free(r); free(q); free(state); - } - return 0; -} - -int bam_prob_realn(bam1_t *b, const char *ref) -{ - return bam_prob_realn_core(b, ref, INT_MAX, 1); -} - int calmd_usage() { fprintf(pysam_stderr, "Usage: samtools calmd [-eubrAES] \n" @@ -347,13 +175,14 @@ int calmd_usage() { " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n" " -E extended BAQ for better sensitivity but lower specificity\n"); - sam_global_opt_help(pysam_stderr, "-...."); + sam_global_opt_help(pysam_stderr, "-....@"); return 1; } int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; + htsThreadPool p = {NULL, 0}; samFile *fp = NULL, *fpout = NULL; bam_hdr_t *header = NULL; faidx_t *fai = NULL; @@ -362,14 +191,14 @@ int bam_fillmd(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), { NULL, 0, NULL, 0 } }; flt_flag = UPDATE_NM | UPDATE_MD; is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; strcpy(mode_w, "w"); - while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) { switch (c) { case 'r': is_realn = 1; break; case 'e': flt_flag |= USE_EQUAL; break; @@ -406,7 +235,7 @@ int bam_fillmd(int argc, char *argv[]) fprintf(pysam_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); goto fail; } - + fpout = sam_open_format(pysam_stdout_fn, mode_w, &ga.out); if (fpout == NULL) { print_error_errno("calmd", "Failed to open output"); @@ -417,6 +246,15 @@ int bam_fillmd(int argc, char *argv[]) goto fail; } + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(pysam_stderr, "Error creating thread pool\n"); + goto fail; + } + hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p); + } + ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference; fai = fai_load(ref_file); @@ -442,9 +280,9 @@ int bam_fillmd(int argc, char *argv[]) if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } - if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag); + if (is_realn) sam_prob_realn(b, ref, len, baq_flag); if (capQ > 10) { - int q = bam_cap_mapQ(b, ref, len, capQ); + int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); @@ -468,6 +306,8 @@ int bam_fillmd(int argc, char *argv[]) fprintf(pysam_stderr, "[bam_fillmd] error when closing output file\n"); return 1; } + if (p.pool) hts_tpool_destroy(p.pool); + return 0; fail: @@ -477,5 +317,7 @@ int bam_fillmd(int argc, char *argv[]) if (fai) fai_destroy(fai); if (fp) sam_close(fp); if (fpout) sam_close(fpout); + if (p.pool) hts_tpool_destroy(p.pool); + return 1; } diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index dc12bf3..d17e9d6 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -118,7 +119,7 @@ void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { - int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all; int rflag_require, rflag_filter; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels @@ -209,11 +210,22 @@ static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { return 1; } +static void +print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, + int pos, int n, const char *ref, int ref_len) +{ + int i; + fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + fputs("\t0\t*\t*", fp); + if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); + if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); + } + putc('\n', fp); +} + static int mplp_func(void *data, bam1_t *b) { - extern int bam_realn(bam1_t *b, const char *ref); - extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag); - extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres); char *ref; mplp_aux_t *ma = (mplp_aux_t*)data; int ret, skip = 0, ref_len; @@ -229,7 +241,7 @@ static int mplp_func(void *data, bam1_t *b) } if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } - if (ma->conf->bed) { // test overlap + if (ma->conf->bed && ma->conf->all == 0) { // test overlap skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)); if (skip) continue; } @@ -258,9 +270,9 @@ static int mplp_func(void *data, bam1_t *b) } skip = 0; - if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); if (has_ref && ma->conf->capQ_thres > 10) { - int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres); + int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); if (q < 0) skip = 1; else if (b->core.qual > q) b->core.qual = q; } @@ -308,7 +320,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth; + int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth; const bam_pileup1_t **plp; mplp_ref_t mp_ref = MPLP_REF_INIT; bam_mplp_t iter; @@ -379,7 +391,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); exit(EXIT_FAILURE); } - if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; + if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid; hts_idx_destroy(idx); } else @@ -551,14 +563,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; + int last_tid = -1, last_pos = -1; + // begin pileup while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; mplp_get_ref(data[0], tid, &ref, &ref_len); //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); if (conf->flag & MPLP_BCF) { int total_depth, _ref0, ref16; + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; @@ -584,6 +598,35 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } } else { + if (conf->all) { + // Deal with missing portions of previous tids + while (tid > last_tid) { + if (last_tid >= 0 && !conf->reg) { + while (++last_pos < h->target_len[last_tid]) { + if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); + } + } + last_tid++; + last_pos = -1; + if (conf->all < 2) + break; + } + } + if (conf->all) { + // Deal with missing portion of current tid + while (++last_pos < pos) { + if (conf->reg && last_pos < beg0) continue; // out of range; skip + if (conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len); + } + last_tid = tid; + last_pos = pos; + } + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; + fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j, cnt; @@ -600,14 +643,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); } else { + int n = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = p->qpos < p->b->core.l_qseq ? bam_get_qual(p->b)[p->qpos] : 0; if (c >= conf->min_baseQ) - pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); + n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } + if (!n) putc('*', pileup_fp); + + n = 0; putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; @@ -617,9 +664,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (c >= conf->min_baseQ) { c = c + 33 < 126? c + 33 : 126; putc(c, pileup_fp); + n++; } } + if (!n) putc('*', pileup_fp); + if (conf->flag & MPLP_PRINT_MAPQ) { + n = 0; putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; @@ -628,19 +679,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putc(c, pileup_fp); + n++; } + if (!n) putc('*', pileup_fp); } + if (conf->flag & MPLP_PRINT_POS) { + n = 0; putc('\t', pileup_fp); - int last = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if ( c < conf->min_baseQ ) continue; - if (last++) putc(',', pileup_fp); + if (n > 0) putc(',', pileup_fp); fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... + n++; } + if (!n) putc('*', pileup_fp); } } } @@ -648,6 +704,27 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } + if (conf->all && !(conf->flag & MPLP_BCF)) { + // Handle terminating region + if (last_tid < 0 && conf->reg && conf->all > 1) { + last_tid = tid0; + last_pos = beg0-1; + mplp_get_ref(data[0], tid0, &ref, &ref_len); + } + while (last_tid >= 0 && last_tid < h->n_targets) { + while (++last_pos < h->target_len[last_tid]) { + if (last_pos >= end0) break; + if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); + } + last_tid++; + last_pos = -1; + if (conf->all < 2 || conf->reg) + break; + } + } + // clean up free(bc.tmp.s); bcf_destroy1(bcf_rec); @@ -681,6 +758,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) return ret; } +static int is_url(const char *s) +{ + static const char uri_scheme_chars[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"; + return s[strspn(s, uri_scheme_chars)] == ':'; +} + #define MAX_PATH_LEN 1024 int read_file_list(const char *file_list,int *n,char **argv[]) { @@ -710,7 +794,7 @@ int read_file_list(const char *file_list,int *n,char **argv[]) // check sanity of the file list buf[len] = 0; - if (stat(buf, &sb) != 0) + if (! (is_url(buf) || stat(buf, &sb) == 0)) { // no such file, check if it is safe to print its name int i, safe_to_print = 1; @@ -814,6 +898,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "Output options for mpileup format (without -g/-v):\n" " -O, --output-BP output base positions on reads\n" " -s, --output-MQ output mapping quality\n" +" -a output all positions (including zero depth)\n" +" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n" "\n" "Output options for genotype likelihoods (when -g/-v is used):\n" " -t, --output-tags LIST optional tags to output:\n" @@ -836,7 +922,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) fprintf(fp, " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" " -P, --platforms STR comma separated list of platforms for indels [all]\n"); - sam_global_opt_help(fp, "-.--."); + sam_global_opt_help(fp, "-.--.-"); fprintf(fp, "\n" "Notes: Assuming diploid individuals.\n"); @@ -862,11 +948,12 @@ int bam_mpileup(int argc, char *argv[]) mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; + mplp.all = 0; sam_global_args_init(&mplp.ga); static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), {"rf", required_argument, NULL, 1}, // require flag {"ff", required_argument, NULL, 2}, // filter flag {"incl-flags", required_argument, NULL, 1}, @@ -916,7 +1003,7 @@ int bam_mpileup(int argc, char *argv[]) {"platforms", required_argument, NULL, 'P'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -988,6 +1075,7 @@ int bam_mpileup(int argc, char *argv[]) } break; case 't': mplp.fmt_flag |= parse_format_flag(optarg); break; + case 'a': mplp.all++; break; default: if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break; /* else fall-through */ diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index 650e818..03e5f8a 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -120,7 +121,7 @@ void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { - int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all; int rflag_require, rflag_filter; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels @@ -211,11 +212,22 @@ static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { return 1; } +static void +print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, + int pos, int n, const char *ref, int ref_len) +{ + int i; + fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + fputs("\t0\t*\t*", fp); + if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); + if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); + } + putc('\n', fp); +} + static int mplp_func(void *data, bam1_t *b) { - extern int bam_realn(bam1_t *b, const char *ref); - extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag); - extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres); char *ref; mplp_aux_t *ma = (mplp_aux_t*)data; int ret, skip = 0, ref_len; @@ -231,7 +243,7 @@ static int mplp_func(void *data, bam1_t *b) } if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } - if (ma->conf->bed) { // test overlap + if (ma->conf->bed && ma->conf->all == 0) { // test overlap skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)); if (skip) continue; } @@ -260,9 +272,9 @@ static int mplp_func(void *data, bam1_t *b) } skip = 0; - if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); if (has_ref && ma->conf->capQ_thres > 10) { - int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres); + int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); if (q < 0) skip = 1; else if (b->core.qual > q) b->core.qual = q; } @@ -310,7 +322,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth; + int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth; const bam_pileup1_t **plp; mplp_ref_t mp_ref = MPLP_REF_INIT; bam_mplp_t iter; @@ -381,7 +393,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(pysam_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); exit(EXIT_FAILURE); } - if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; + if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid; hts_idx_destroy(idx); } else @@ -553,14 +565,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; + int last_tid = -1, last_pos = -1; + // begin pileup while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; mplp_get_ref(data[0], tid, &ref, &ref_len); //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); if (conf->flag & MPLP_BCF) { int total_depth, _ref0, ref16; + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; @@ -586,6 +600,35 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } } else { + if (conf->all) { + // Deal with missing portions of previous tids + while (tid > last_tid) { + if (last_tid >= 0 && !conf->reg) { + while (++last_pos < h->target_len[last_tid]) { + if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); + } + } + last_tid++; + last_pos = -1; + if (conf->all < 2) + break; + } + } + if (conf->all) { + // Deal with missing portion of current tid + while (++last_pos < pos) { + if (conf->reg && last_pos < beg0) continue; // out of range; skip + if (conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len); + } + last_tid = tid; + last_pos = pos; + } + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; + fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j, cnt; @@ -602,14 +645,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); } else { + int n = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = p->qpos < p->b->core.l_qseq ? bam_get_qual(p->b)[p->qpos] : 0; if (c >= conf->min_baseQ) - pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); + n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } + if (!n) putc('*', pileup_fp); + + n = 0; putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; @@ -619,9 +666,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (c >= conf->min_baseQ) { c = c + 33 < 126? c + 33 : 126; putc(c, pileup_fp); + n++; } } + if (!n) putc('*', pileup_fp); + if (conf->flag & MPLP_PRINT_MAPQ) { + n = 0; putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; @@ -630,19 +681,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putc(c, pileup_fp); + n++; } + if (!n) putc('*', pileup_fp); } + if (conf->flag & MPLP_PRINT_POS) { + n = 0; putc('\t', pileup_fp); - int last = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if ( c < conf->min_baseQ ) continue; - if (last++) putc(',', pileup_fp); + if (n > 0) putc(',', pileup_fp); fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(pysam_stdout, ) is very slow... + n++; } + if (!n) putc('*', pileup_fp); } } } @@ -650,6 +706,27 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } + if (conf->all && !(conf->flag & MPLP_BCF)) { + // Handle terminating region + if (last_tid < 0 && conf->reg && conf->all > 1) { + last_tid = tid0; + last_pos = beg0-1; + mplp_get_ref(data[0], tid0, &ref, &ref_len); + } + while (last_tid >= 0 && last_tid < h->n_targets) { + while (++last_pos < h->target_len[last_tid]) { + if (last_pos >= end0) break; + if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); + } + last_tid++; + last_pos = -1; + if (conf->all < 2 || conf->reg) + break; + } + } + // clean up free(bc.tmp.s); bcf_destroy1(bcf_rec); @@ -683,6 +760,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) return ret; } +static int is_url(const char *s) +{ + static const char uri_scheme_chars[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"; + return s[strspn(s, uri_scheme_chars)] == ':'; +} + #define MAX_PATH_LEN 1024 int read_file_list(const char *file_list,int *n,char **argv[]) { @@ -712,7 +796,7 @@ int read_file_list(const char *file_list,int *n,char **argv[]) // check sanity of the file list buf[len] = 0; - if (stat(buf, &sb) != 0) + if (! (is_url(buf) || stat(buf, &sb) == 0)) { // no such file, check if it is safe to print its name int i, safe_to_print = 1; @@ -816,6 +900,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "Output options for mpileup format (without -g/-v):\n" " -O, --output-BP output base positions on reads\n" " -s, --output-MQ output mapping quality\n" +" -a output all positions (including zero depth)\n" +" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n" "\n" "Output options for genotype likelihoods (when -g/-v is used):\n" " -t, --output-tags LIST optional tags to output:\n" @@ -838,7 +924,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) fprintf(fp, " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" " -P, --platforms STR comma separated list of platforms for indels [all]\n"); - sam_global_opt_help(fp, "-.--."); + sam_global_opt_help(fp, "-.--.-"); fprintf(fp, "\n" "Notes: Assuming diploid individuals.\n"); @@ -864,11 +950,12 @@ int bam_mpileup(int argc, char *argv[]) mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; + mplp.all = 0; sam_global_args_init(&mplp.ga); static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), {"rf", required_argument, NULL, 1}, // require flag {"ff", required_argument, NULL, 2}, // filter flag {"incl-flags", required_argument, NULL, 1}, @@ -918,7 +1005,7 @@ int bam_mpileup(int argc, char *argv[]) {"platforms", required_argument, NULL, 'P'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -990,6 +1077,7 @@ int bam_mpileup(int argc, char *argv[]) } break; case 't': mplp.fmt_flag |= parse_format_flag(optarg); break; + case 'a': mplp.all++; break; default: if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break; /* else fall-through */ diff --git a/samtools/bam_quickcheck.c b/samtools/bam_quickcheck.c index 6c3c664..02616fe 100644 --- a/samtools/bam_quickcheck.c +++ b/samtools/bam_quickcheck.c @@ -26,7 +26,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include #include #include #include @@ -102,7 +101,7 @@ int main_quickcheck(int argc, char** argv) // attempt to open htsFile *hts_fp = hts_open(fn, "r"); if (hts_fp == NULL) { - if (verbose >= 2) fprintf(stderr, "%s could not be opened for reading\n", fn); + if (verbose >= 2) fprintf(stderr, "%s could not be opened for reading.\n", fn); file_state |= 2; } else { @@ -110,37 +109,54 @@ int main_quickcheck(int argc, char** argv) // make sure we have sequence data const htsFormat *fmt = hts_get_format(hts_fp); if (fmt->category != sequence_data ) { - if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data\n", fn); + if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data.\n", fn); file_state |= 4; } else { if (verbose >= 3) fprintf(stderr, "%s is sequence data\n", fn); // check header bam_hdr_t *header = sam_hdr_read(hts_fp); - if (header->n_targets <= 0) { - if (verbose >= 2) fprintf(stderr, "%s had no targets in header\n", fn); + if (header == NULL) { + if (verbose >= 2) fprintf(stderr, "%s caused an error whilst reading its header.\n", fn); file_state |= 8; - } - else { - if (verbose >= 3) fprintf(stderr, "%s has %d targets in header\n", fn, header->n_targets); - } - - // only check EOF on BAM for now - // TODO implement and use hts_check_EOF() to include CRAM support - if (fmt->format == bam) { - if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) { - if (verbose >= 2) fprintf(stderr, "%s was missing EOF block\n", fn); - file_state |= 16; + } else { + if (header->n_targets <= 0) { + if (verbose >= 2) fprintf(stderr, "%s had no targets in header.\n", fn); + file_state |= 8; } else { - if (verbose >= 3) fprintf(stderr, "%s has good EOF block\n", fn); + if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, header->n_targets); } + bam_hdr_destroy(header); + } + } + // check EOF on formats that support this + int ret; + if ((ret = hts_check_EOF(hts_fp)) < 0) { + if (verbose >= 2) fprintf(stderr, "%s caused an error whilst checking for EOF block.\n", fn); + file_state |= 16; + } + else { + switch (ret) { + case 0: + if (verbose >= 2) fprintf(stderr, "%s was missing EOF block when one should be present.\n", fn); + file_state |= 16; + break; + case 1: + if (verbose >= 3) fprintf(stderr, "%s has good EOF block.\n", fn); + break; + case 2: + if (verbose >= 3) fprintf(stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn); + break; + case 3: + if (verbose >= 3) fprintf(stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn); + break; } } if (hts_close(hts_fp) < 0) { file_state |= 32; - if (verbose >= 2) fprintf(stderr, "%s did not close cleanly\n", fn); + if (verbose >= 2) fprintf(stderr, "%s did not close cleanly.\n", fn); } } diff --git a/samtools/bam_quickcheck.c.pysam.c b/samtools/bam_quickcheck.c.pysam.c index 26dbeb9..c9dc3d2 100644 --- a/samtools/bam_quickcheck.c.pysam.c +++ b/samtools/bam_quickcheck.c.pysam.c @@ -28,7 +28,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include #include #include #include @@ -104,7 +103,7 @@ int main_quickcheck(int argc, char** argv) // attempt to open htsFile *hts_fp = hts_open(fn, "r"); if (hts_fp == NULL) { - if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading\n", fn); + if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading.\n", fn); file_state |= 2; } else { @@ -112,37 +111,54 @@ int main_quickcheck(int argc, char** argv) // make sure we have sequence data const htsFormat *fmt = hts_get_format(hts_fp); if (fmt->category != sequence_data ) { - if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data\n", fn); + if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data.\n", fn); file_state |= 4; } else { if (verbose >= 3) fprintf(pysam_stderr, "%s is sequence data\n", fn); // check header bam_hdr_t *header = sam_hdr_read(hts_fp); - if (header->n_targets <= 0) { - if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header\n", fn); + if (header == NULL) { + if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst reading its header.\n", fn); file_state |= 8; - } - else { - if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header\n", fn, header->n_targets); - } - - // only check EOF on BAM for now - // TODO implement and use hts_check_EOF() to include CRAM support - if (fmt->format == bam) { - if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) { - if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block\n", fn); - file_state |= 16; + } else { + if (header->n_targets <= 0) { + if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header.\n", fn); + file_state |= 8; } else { - if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block\n", fn); + if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header.\n", fn, header->n_targets); } + bam_hdr_destroy(header); + } + } + // check EOF on formats that support this + int ret; + if ((ret = hts_check_EOF(hts_fp)) < 0) { + if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst checking for EOF block.\n", fn); + file_state |= 16; + } + else { + switch (ret) { + case 0: + if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block when one should be present.\n", fn); + file_state |= 16; + break; + case 1: + if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block.\n", fn); + break; + case 2: + if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn); + break; + case 3: + if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn); + break; } } if (hts_close(hts_fp) < 0) { file_state |= 32; - if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly\n", fn); + if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly.\n", fn); } } diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c index 0469c06..acaebd4 100644 --- a/samtools/bam_reheader.c +++ b/samtools/bam_reheader.c @@ -91,7 +91,7 @@ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, goto fail; } if (in->block_offset < in->block_length) { - if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; if (bgzf_flush(fp) < 0) goto write_fail; } while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) { @@ -246,7 +246,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list int32_put_blk(b, header_len); cram_block_append(b, sam_hdr_str(hdr), header_len); // Zero the remaining block - memset(cram_block_get_data(b)+cram_block_get_offset(b), 0, + memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0, cram_block_get_uncomp_size(b) - cram_block_get_offset(b)); // Make sure all sizes and byte-offsets are consistent after memset cram_block_set_offset(b, cram_block_get_uncomp_size(b)); diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index 16990e6..18cb6c4 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -93,7 +93,7 @@ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, goto fail; } if (in->block_offset < in->block_length) { - if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; if (bgzf_flush(fp) < 0) goto write_fail; } while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) { @@ -248,7 +248,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list int32_put_blk(b, header_len); cram_block_append(b, sam_hdr_str(hdr), header_len); // Zero the remaining block - memset(cram_block_get_data(b)+cram_block_get_offset(b), 0, + memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0, cram_block_get_uncomp_size(b) - cram_block_get_offset(b)); // Make sure all sizes and byte-offsets are consistent after memset cram_block_set_offset(b, cram_block_get_uncomp_size(b)); @@ -436,7 +436,7 @@ int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, } } -static void usage(FILE *fp, int ret) { +static int usage(FILE *fp, int ret) { fprintf(fp, "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n" " or samtools reheader [-P] -i in.header.sam file.bam\n" @@ -445,7 +445,7 @@ static void usage(FILE *fp, int ret) { " -P, --no-PG Do not generate an @PG header line.\n" " -i, --in-place Modify the bam/cram file directly.\n" " (Defaults to outputting to pysam_stdout.)\n"); - exit(ret); + return(ret); } int main_reheader(int argc, char *argv[]) @@ -466,15 +466,15 @@ int main_reheader(int argc, char *argv[]) switch (c) { case 'P': add_PG = 0; break; case 'i': inplace = 1; break; - case 'h': usage(pysam_stdout, 0); break; + case 'h': return(usage(pysam_stdout, 0)); break; default: fprintf(pysam_stderr, "Invalid option '%c'\n", c); - usage(pysam_stderr, 1); + return(usage(pysam_stderr, 1)); } } if (argc - optind != 2) - usage(pysam_stderr, 1); + return(usage(pysam_stderr, 1)); { // read the header samFile *fph = sam_open(argv[optind], "r"); diff --git a/samtools/bam_rmdup.c b/samtools/bam_rmdup.c index 57612b4..513848d 100644 --- a/samtools/bam_rmdup.c +++ b/samtools/bam_rmdup.c @@ -258,7 +258,7 @@ static int rmdup_usage(void) { fprintf(stderr, "Option: -s rmdup for SE reads\n"); fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); - sam_global_opt_help(stderr, "-...."); + sam_global_opt_help(stderr, "-....-"); return 1; } @@ -271,7 +271,7 @@ int bam_rmdup(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), { NULL, 0, NULL, 0 } }; diff --git a/samtools/bam_rmdup.c.pysam.c b/samtools/bam_rmdup.c.pysam.c index 3c16025..6742fc8 100644 --- a/samtools/bam_rmdup.c.pysam.c +++ b/samtools/bam_rmdup.c.pysam.c @@ -260,7 +260,7 @@ static int rmdup_usage(void) { fprintf(pysam_stderr, "Option: -s rmdup for SE reads\n"); fprintf(pysam_stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); - sam_global_opt_help(pysam_stderr, "-...."); + sam_global_opt_help(pysam_stderr, "-....-"); return 1; } @@ -273,7 +273,7 @@ int bam_rmdup(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), { NULL, 0, NULL, 0 } }; diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 4955dcc..be9789c 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -43,6 +43,17 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" #include "sam_opts.h" +#include "samtools.h" + +/* Minimum memory required in megabytes before sort will attempt to run. This + is to prevent accidents where failing to use the -m option correctly results + in the creation of a temporary file for each read in the input file. + Don't forget to update the man page if you change this. */ +const size_t SORT_MIN_MEGS_PER_THREAD = 1; + +/* Default per-thread memory for sort. Must be >= SORT_MIN_MEGS_PER_THREAD. + Don't forget to update the man page if you change this. */ +const size_t SORT_DEFAULT_MEGS_PER_THREAD = 768; #if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L #define NEED_MEMSET_PATTERN4 @@ -1098,6 +1109,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) + @param cmd command name (used in print_error() etc) @param in_fmt format options for input files @param out_fmt output file format and options @discussion Padding information may NOT correctly maintained. This @@ -1105,7 +1117,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, - const char *reg, int n_threads, + const char *reg, int n_threads, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt) { samFile *fpout, **fp = NULL; @@ -1126,25 +1138,15 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { - const char *message = strerror(errno); - fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); + print_error_errno(cmd, "cannot open \"%s\"", headers); return -1; } hin = sam_hdr_read(fpheaders); sam_close(fpheaders); if (hin == NULL) { - fprintf(stderr, "[bam_merge_core] couldn't read headers for '%s'\n", - headers); - goto mem_fail; - } - } else { - hout = bam_hdr_init(); - if (!hout) { - fprintf(stderr, "[bam_merge_core] couldn't allocate bam header\n"); + print_error(cmd, "couldn't read headers from \"%s\"", headers); goto mem_fail; } - hout->text = strdup(""); - if (!hout->text) goto mem_fail; } g_is_by_qname = by_qname; @@ -1194,13 +1196,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_hdr_t *hin; fp[i] = sam_open_format(fn[i], "r", in_fmt); if (fp[i] == NULL) { - fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); + print_error_errno(cmd, "fail to open \"%s\"", fn[i]); goto fail; } hin = sam_hdr_read(fp[i]); if (hin == NULL) { - fprintf(stderr, "[bam_merge_core] failed to read header for '%s'\n", - fn[i]); + print_error(cmd, "failed to read header from \"%s\"", fn[i]); goto fail; } @@ -1218,6 +1219,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } + + // Potential future improvement is to share headers between CRAM files for + // samtools sort (where all headers are identical. + // Eg: + // + // if (i > 1) { + // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram)); + // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram)); + // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram)); + // } } // Did we get an @HD line? @@ -1326,19 +1337,18 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_destroy1(h->b); h->b = NULL; } else { - fprintf(stderr, "[%s] failed to read first record from %s\n", - __func__, fn[i]); + print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; } } // Open output file and write header if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) { - fprintf(stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno)); + print_error_errno(cmd, "failed to create \"%s\"", out); return -1; } if (sam_hdr_write(fpout, hout) != 0) { - fprintf(stderr, "[%s] failed to write header.\n", __func__); + print_error_errno(cmd, "failed to write header to \"%s\"", out); sam_close(fpout); return -1; } @@ -1354,7 +1364,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } if (sam_write1(fpout, hout, b) < 0) { - fprintf(stderr, "[%s] failed to write to output file.\n", __func__); + print_error_errno(cmd, "failed writing to \"%s\"", out); sam_close(fpout); return -1; } @@ -1367,8 +1377,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_destroy1(heap->b); heap->b = NULL; } else { - fprintf(stderr, "[bam_merge_core] error: '%s' is truncated.\n", - fn[heap->i]); + print_error(cmd, "\"%s\" is truncated", fn[heap->i]); goto fail; } ks_heapadjust(heap, 0, n, heap); @@ -1390,13 +1399,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, free_merged_header(merged_hdr); free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); if (sam_close(fpout) < 0) { - fprintf(stderr, "[bam_merge_core] error closing output file\n"); + print_error(cmd, "error closing output file"); return -1; } return 0; mem_fail: - fprintf(stderr, "[bam_merge_core] Out of memory\n"); + print_error(cmd, "Out of memory"); fail: if (flag & MERGE_RG) { @@ -1430,7 +1439,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch strcpy(mode, "wb"); if (flag & MERGE_UNCOMP) strcat(mode, "0"); else if (flag & MERGE_LEVEL1) strcat(mode, "1"); - return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL); + return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL); } static void merge_usage(FILE *to) @@ -1450,15 +1459,13 @@ static void merge_usage(FILE *to) " -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n" " -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n" " -s VALUE Override random seed\n" -" -b FILE List of input BAM filenames, one per line [null]\n" -" -@, --threads INT\n" -" Number of BAM/CRAM compression threads [0]\n"); - sam_global_opt_help(to, "-.O.."); +" -b FILE List of input BAM filenames, one per line [null]\n"); + sam_global_opt_help(to, "-.O..@"); } int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1; + int c, is_by_qname = 0, flag = 0, ret = 0, level = -1; char *fn_headers = NULL, *reg = NULL, mode[12]; long random_seed = (long)time(NULL); char** fn = NULL; @@ -1466,7 +1473,7 @@ int bam_merge(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; @@ -1486,7 +1493,6 @@ int bam_merge(int argc, char *argv[]) case 'u': flag |= MERGE_UNCOMP; level = 0; break; case 'R': reg = strdup(optarg); break; case 'l': level = atoi(optarg); break; - case '@': n_threads = atoi(optarg); break; case 'c': flag |= MERGE_COMBINE_RG; break; case 'p': flag |= MERGE_COMBINE_PG; break; case 's': random_seed = atol(optarg); break; @@ -1500,9 +1506,10 @@ int bam_merge(int argc, char *argv[]) if (fn == NULL) { ret = 1; goto end; } memcpy(fn+fn_size, fn_read, nfiles * sizeof(char*)); fn_size += nfiles; + free(fn_read); } else { - fprintf(stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg); + print_error("merge", "Invalid file list \"%s\"", optarg); ret = 1; } break; @@ -1514,7 +1521,7 @@ int bam_merge(int argc, char *argv[]) } } if ( argc - optind < 1 ) { - fprintf(stderr, "You must at least specify the output file.\n"); + print_error("merge", "You must at least specify the output file"); merge_usage(stderr); return 1; } @@ -1537,7 +1544,7 @@ int bam_merge(int argc, char *argv[]) memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); } if (fn_size+nargcfiles < 1) { - fprintf(stderr, "You must specify at least one (and usually two or more) input files.\n"); + print_error("merge", "You must specify at least one (and usually two or more) input files"); merge_usage(stderr); return 1; } @@ -1545,8 +1552,8 @@ int bam_merge(int argc, char *argv[]) sam_open_mode(mode+1, argv[optind], NULL); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers, - fn_size+nargcfiles, fn, flag, reg, n_threads, - &ga.in, &ga.out) < 0) + fn_size+nargcfiles, fn, flag, reg, ga.nthreads, + "merge", &ga.in, &ga.out) < 0) ret = 1; end: @@ -1651,18 +1658,30 @@ static void *worker(void *data) name = (char*)calloc(strlen(w->prefix) + 20, 1); if (!name) { w->error = errno; return 0; } sprintf(name, "%s.%.4d.bam", w->prefix, w->index); - if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) - w->error = errno; - -// Consider using CRAM temporary files if the final output is CRAM. -// Typically it is comparable speed while being smaller. -// hts_opt opt[2] = { -// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL}, -// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL} -// }; -// opt[0].next = &opt[1]; -// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0) -// w->error = errno; + + uint32_t max_ncigar = 0; + int i; + for (i = 0; i < w->buf_len; i++) { + uint32_t nc = w->buf[i]->core.n_cigar; + if (max_ncigar < nc) + max_ncigar = nc; + } + + if (max_ncigar > 65535) { + htsFormat fmt; + memset(&fmt, 0, sizeof(fmt)); + if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) { + w->error = errno; + free(name); + return 0; + } + + if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0) + w->error = errno; + } else { + if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) + w->error = errno; + } free(name); return 0; @@ -1697,7 +1716,8 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c for (i = 0; i < n_threads; ++i) { pthread_join(tid[i], 0); if (w[i].error != 0) { - fprintf(stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error)); + errno = w[i].error; + print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index); n_failed++; } } @@ -1741,17 +1761,23 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, buf = NULL; fp = sam_open_format(fn, "r", in_fmt); if (fp == NULL) { - const char *message = strerror(errno); - fprintf(stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message); + print_error_errno("sort", "can't open \"%s\"", fn); return -2; } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(stderr, "[bam_sort_core] failed to read header for '%s'\n", fn); + print_error("sort", "failed to read header from \"%s\"", fn); goto err; } if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); + + // No gain to using the thread pool here as the flow of this code + // is such that we are *either* reading *or* sorting. Hence a shared + // pool makes no real difference except to reduce the thread count a little. + if (n_threads > 1) + hts_set_threads(fp, n_threads); + // write sub files for (;;) { if (k == max_k) { @@ -1780,7 +1806,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, } } if (ret != -1) { - fprintf(stderr, "[bam_sort_core] truncated file. Aborting.\n"); + print_error("sort", "truncated file. Aborting"); ret = -1; goto err; } @@ -1789,7 +1815,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, if (n_files == 0) { // a single block ks_mergesort(sort, k, buf, 0); if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { - fprintf(stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno)); + print_error_errno("sort", "failed to create \"%s\"", fnout); ret = -1; goto err; } @@ -1808,7 +1834,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, } if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO, - NULL, n_threads, in_fmt, out_fmt) < 0) { + NULL, n_threads, "sort", in_fmt, out_fmt) < 0) { // Propagate bam_merge_core2() failure; it has already emitted a // message explaining the failure, so no further message is needed. goto err; @@ -1851,23 +1877,38 @@ static void sort_usage(FILE *fp) " -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n" " -n Sort by read name\n" " -o FILE Write final output to FILE rather than standard output\n" -" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" -" -@, --threads INT\n" -" Set number of sorting and compression threads [1]\n"); - sam_global_opt_help(fp, "-.O.."); +" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"); + sam_global_opt_help(fp, "-.O..@"); +} + +static void complain_about_memory_setting(size_t max_mem) { + char *suffix = ""; + const size_t nine_k = 9<<10; + if (max_mem > nine_k) { max_mem >>= 10; suffix = "K"; } + if (max_mem > nine_k) { max_mem >>= 10; suffix = "M"; } + + fprintf(stderr, +"[bam_sort] -m setting (%zu%s bytes) is less than the minimum required (%zuM).\n\n" +"Trying to run with -m too small can lead to the creation of a very large number\n" +"of temporary files. This may make sort fail due to it exceeding limits on the\n" +"number of files it can have open at the same time.\n\n" +"Please check your -m parameter. It should be an integer followed by one of the\n" +"letters K (for kilobytes), M (megabytes) or G (gigabytes). You should ensure it\n" +"is at least the minimum above, and much higher if you are sorting a large file.\n", + max_mem, suffix, SORT_MIN_MEGS_PER_THREAD); } int bam_sort(int argc, char *argv[]) { - size_t max_mem = 768<<20; // 512MB - int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1; + size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; + int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; struct stat st; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; @@ -1885,7 +1926,6 @@ int bam_sort(int argc, char *argv[]) break; } case 'T': kputs(optarg, &tmpprefix); break; - case '@': n_threads = atoi(optarg); break; case 'l': level = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; @@ -1910,6 +1950,12 @@ int bam_sort(int argc, char *argv[]) goto sort_end; } + if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) { + complain_about_memory_setting(max_mem); + ret = EXIT_FAILURE; + goto sort_end; + } + strcpy(modeout, "wb"); sam_open_mode(modeout+1, fnout, NULL); if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9); @@ -1925,7 +1971,7 @@ int bam_sort(int argc, char *argv[]) } ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, n_threads, + tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, &ga.in, &ga.out); if (ret >= 0) ret = EXIT_SUCCESS; diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index b2b625d..ea2a30d 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -45,6 +45,17 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" #include "sam_opts.h" +#include "samtools.h" + +/* Minimum memory required in megabytes before sort will attempt to run. This + is to prevent accidents where failing to use the -m option correctly results + in the creation of a temporary file for each read in the input file. + Don't forget to update the man page if you change this. */ +const size_t SORT_MIN_MEGS_PER_THREAD = 1; + +/* Default per-thread memory for sort. Must be >= SORT_MIN_MEGS_PER_THREAD. + Don't forget to update the man page if you change this. */ +const size_t SORT_DEFAULT_MEGS_PER_THREAD = 768; #if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L #define NEED_MEMSET_PATTERN4 @@ -1100,6 +1111,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) + @param cmd command name (used in print_error() etc) @param in_fmt format options for input files @param out_fmt output file format and options @discussion Padding information may NOT correctly maintained. This @@ -1107,7 +1119,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, - const char *reg, int n_threads, + const char *reg, int n_threads, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt) { samFile *fpout, **fp = NULL; @@ -1128,25 +1140,15 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { - const char *message = strerror(errno); - fprintf(pysam_stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); + print_error_errno(cmd, "cannot open \"%s\"", headers); return -1; } hin = sam_hdr_read(fpheaders); sam_close(fpheaders); if (hin == NULL) { - fprintf(pysam_stderr, "[bam_merge_core] couldn't read headers for '%s'\n", - headers); - goto mem_fail; - } - } else { - hout = bam_hdr_init(); - if (!hout) { - fprintf(pysam_stderr, "[bam_merge_core] couldn't allocate bam header\n"); + print_error(cmd, "couldn't read headers from \"%s\"", headers); goto mem_fail; } - hout->text = strdup(""); - if (!hout->text) goto mem_fail; } g_is_by_qname = by_qname; @@ -1196,13 +1198,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_hdr_t *hin; fp[i] = sam_open_format(fn[i], "r", in_fmt); if (fp[i] == NULL) { - fprintf(pysam_stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); + print_error_errno(cmd, "fail to open \"%s\"", fn[i]); goto fail; } hin = sam_hdr_read(fp[i]); if (hin == NULL) { - fprintf(pysam_stderr, "[bam_merge_core] failed to read header for '%s'\n", - fn[i]); + print_error(cmd, "failed to read header from \"%s\"", fn[i]); goto fail; } @@ -1220,6 +1221,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysam_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } + + // Potential future improvement is to share headers between CRAM files for + // samtools sort (where all headers are identical. + // Eg: + // + // if (i > 1) { + // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram)); + // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram)); + // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram)); + // } } // Did we get an @HD line? @@ -1328,19 +1339,18 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_destroy1(h->b); h->b = NULL; } else { - fprintf(pysam_stderr, "[%s] failed to read first record from %s\n", - __func__, fn[i]); + print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; } } // Open output file and write header if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) { - fprintf(pysam_stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno)); + print_error_errno(cmd, "failed to create \"%s\"", out); return -1; } if (sam_hdr_write(fpout, hout) != 0) { - fprintf(pysam_stderr, "[%s] failed to write header.\n", __func__); + print_error_errno(cmd, "failed to write header to \"%s\"", out); sam_close(fpout); return -1; } @@ -1356,7 +1366,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } if (sam_write1(fpout, hout, b) < 0) { - fprintf(pysam_stderr, "[%s] failed to write to output file.\n", __func__); + print_error_errno(cmd, "failed writing to \"%s\"", out); sam_close(fpout); return -1; } @@ -1369,8 +1379,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_destroy1(heap->b); heap->b = NULL; } else { - fprintf(pysam_stderr, "[bam_merge_core] error: '%s' is truncated.\n", - fn[heap->i]); + print_error(cmd, "\"%s\" is truncated", fn[heap->i]); goto fail; } ks_heapadjust(heap, 0, n, heap); @@ -1392,13 +1401,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, free_merged_header(merged_hdr); free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); if (sam_close(fpout) < 0) { - fprintf(pysam_stderr, "[bam_merge_core] error closing output file\n"); + print_error(cmd, "error closing output file"); return -1; } return 0; mem_fail: - fprintf(pysam_stderr, "[bam_merge_core] Out of memory\n"); + print_error(cmd, "Out of memory"); fail: if (flag & MERGE_RG) { @@ -1432,7 +1441,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch strcpy(mode, "wb"); if (flag & MERGE_UNCOMP) strcat(mode, "0"); else if (flag & MERGE_LEVEL1) strcat(mode, "1"); - return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL); + return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL); } static void merge_usage(FILE *to) @@ -1452,15 +1461,13 @@ static void merge_usage(FILE *to) " -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n" " -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n" " -s VALUE Override random seed\n" -" -b FILE List of input BAM filenames, one per line [null]\n" -" -@, --threads INT\n" -" Number of BAM/CRAM compression threads [0]\n"); - sam_global_opt_help(to, "-.O.."); +" -b FILE List of input BAM filenames, one per line [null]\n"); + sam_global_opt_help(to, "-.O..@"); } int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1; + int c, is_by_qname = 0, flag = 0, ret = 0, level = -1; char *fn_headers = NULL, *reg = NULL, mode[12]; long random_seed = (long)time(NULL); char** fn = NULL; @@ -1468,7 +1475,7 @@ int bam_merge(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; @@ -1488,7 +1495,6 @@ int bam_merge(int argc, char *argv[]) case 'u': flag |= MERGE_UNCOMP; level = 0; break; case 'R': reg = strdup(optarg); break; case 'l': level = atoi(optarg); break; - case '@': n_threads = atoi(optarg); break; case 'c': flag |= MERGE_COMBINE_RG; break; case 'p': flag |= MERGE_COMBINE_PG; break; case 's': random_seed = atol(optarg); break; @@ -1502,9 +1508,10 @@ int bam_merge(int argc, char *argv[]) if (fn == NULL) { ret = 1; goto end; } memcpy(fn+fn_size, fn_read, nfiles * sizeof(char*)); fn_size += nfiles; + free(fn_read); } else { - fprintf(pysam_stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg); + print_error("merge", "Invalid file list \"%s\"", optarg); ret = 1; } break; @@ -1516,7 +1523,7 @@ int bam_merge(int argc, char *argv[]) } } if ( argc - optind < 1 ) { - fprintf(pysam_stderr, "You must at least specify the output file.\n"); + print_error("merge", "You must at least specify the output file"); merge_usage(pysam_stderr); return 1; } @@ -1539,7 +1546,7 @@ int bam_merge(int argc, char *argv[]) memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); } if (fn_size+nargcfiles < 1) { - fprintf(pysam_stderr, "You must specify at least one (and usually two or more) input files.\n"); + print_error("merge", "You must specify at least one (and usually two or more) input files"); merge_usage(pysam_stderr); return 1; } @@ -1547,8 +1554,8 @@ int bam_merge(int argc, char *argv[]) sam_open_mode(mode+1, argv[optind], NULL); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers, - fn_size+nargcfiles, fn, flag, reg, n_threads, - &ga.in, &ga.out) < 0) + fn_size+nargcfiles, fn, flag, reg, ga.nthreads, + "merge", &ga.in, &ga.out) < 0) ret = 1; end: @@ -1653,18 +1660,30 @@ static void *worker(void *data) name = (char*)calloc(strlen(w->prefix) + 20, 1); if (!name) { w->error = errno; return 0; } sprintf(name, "%s.%.4d.bam", w->prefix, w->index); - if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) - w->error = errno; - -// Consider using CRAM temporary files if the final output is CRAM. -// Typically it is comparable speed while being smaller. -// hts_opt opt[2] = { -// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL}, -// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL} -// }; -// opt[0].next = &opt[1]; -// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0) -// w->error = errno; + + uint32_t max_ncigar = 0; + int i; + for (i = 0; i < w->buf_len; i++) { + uint32_t nc = w->buf[i]->core.n_cigar; + if (max_ncigar < nc) + max_ncigar = nc; + } + + if (max_ncigar > 65535) { + htsFormat fmt; + memset(&fmt, 0, sizeof(fmt)); + if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) { + w->error = errno; + free(name); + return 0; + } + + if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0) + w->error = errno; + } else { + if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) + w->error = errno; + } free(name); return 0; @@ -1699,7 +1718,8 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c for (i = 0; i < n_threads; ++i) { pthread_join(tid[i], 0); if (w[i].error != 0) { - fprintf(pysam_stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error)); + errno = w[i].error; + print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index); n_failed++; } } @@ -1743,17 +1763,23 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, buf = NULL; fp = sam_open_format(fn, "r", in_fmt); if (fp == NULL) { - const char *message = strerror(errno); - fprintf(pysam_stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message); + print_error_errno("sort", "can't open \"%s\"", fn); return -2; } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(pysam_stderr, "[bam_sort_core] failed to read header for '%s'\n", fn); + print_error("sort", "failed to read header from \"%s\"", fn); goto err; } if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); + + // No gain to using the thread pool here as the flow of this code + // is such that we are *either* reading *or* sorting. Hence a shared + // pool makes no real difference except to reduce the thread count a little. + if (n_threads > 1) + hts_set_threads(fp, n_threads); + // write sub files for (;;) { if (k == max_k) { @@ -1782,7 +1808,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, } } if (ret != -1) { - fprintf(pysam_stderr, "[bam_sort_core] truncated file. Aborting.\n"); + print_error("sort", "truncated file. Aborting"); ret = -1; goto err; } @@ -1791,7 +1817,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, if (n_files == 0) { // a single block ks_mergesort(sort, k, buf, 0); if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { - fprintf(pysam_stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno)); + print_error_errno("sort", "failed to create \"%s\"", fnout); ret = -1; goto err; } @@ -1810,7 +1836,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, } if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO, - NULL, n_threads, in_fmt, out_fmt) < 0) { + NULL, n_threads, "sort", in_fmt, out_fmt) < 0) { // Propagate bam_merge_core2() failure; it has already emitted a // message explaining the failure, so no further message is needed. goto err; @@ -1853,23 +1879,38 @@ static void sort_usage(FILE *fp) " -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n" " -n Sort by read name\n" " -o FILE Write final output to FILE rather than standard output\n" -" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" -" -@, --threads INT\n" -" Set number of sorting and compression threads [1]\n"); - sam_global_opt_help(fp, "-.O.."); +" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"); + sam_global_opt_help(fp, "-.O..@"); +} + +static void complain_about_memory_setting(size_t max_mem) { + char *suffix = ""; + const size_t nine_k = 9<<10; + if (max_mem > nine_k) { max_mem >>= 10; suffix = "K"; } + if (max_mem > nine_k) { max_mem >>= 10; suffix = "M"; } + + fprintf(pysam_stderr, +"[bam_sort] -m setting (%zu%s bytes) is less than the minimum required (%zuM).\n\n" +"Trying to run with -m too small can lead to the creation of a very large number\n" +"of temporary files. This may make sort fail due to it exceeding limits on the\n" +"number of files it can have open at the same time.\n\n" +"Please check your -m parameter. It should be an integer followed by one of the\n" +"letters K (for kilobytes), M (megabytes) or G (gigabytes). You should ensure it\n" +"is at least the minimum above, and much higher if you are sorting a large file.\n", + max_mem, suffix, SORT_MIN_MEGS_PER_THREAD); } int bam_sort(int argc, char *argv[]) { - size_t max_mem = 768<<20; // 512MB - int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1; + size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; + int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; struct stat st; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; @@ -1887,7 +1928,6 @@ int bam_sort(int argc, char *argv[]) break; } case 'T': kputs(optarg, &tmpprefix); break; - case '@': n_threads = atoi(optarg); break; case 'l': level = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; @@ -1912,6 +1952,12 @@ int bam_sort(int argc, char *argv[]) goto sort_end; } + if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) { + complain_about_memory_setting(max_mem); + ret = EXIT_FAILURE; + goto sort_end; + } + strcpy(modeout, "wb"); sam_open_mode(modeout+1, fnout, NULL); if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9); @@ -1927,7 +1973,7 @@ int bam_sort(int argc, char *argv[]) } ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, n_threads, + tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, &ga.in, &ga.out); if (ret >= 0) ret = EXIT_SUCCESS; diff --git a/samtools/bam_split.c b/samtools/bam_split.c index 9a2998a..9bb2030 100644 --- a/samtools/bam_split.c +++ b/samtools/bam_split.c @@ -1,6 +1,6 @@ /* bam_split.c -- split subcommand. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Martin Pollard @@ -34,7 +34,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include +#include "htslib/thread_pool.h" #include "sam_opts.h" +#include "samtools.h" KHASH_MAP_INIT_STR(c2i, int) @@ -61,6 +64,7 @@ struct state { samFile** rg_output_file; bam_hdr_t** rg_output_header; kh_c2i_t* rg_hash; + htsThreadPool p; }; typedef struct state state_t; @@ -78,7 +82,7 @@ static void usage(FILE *write_to) " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" " -u FILE1:FILE2 ...and override the header with FILE2\n" " -v verbose output\n"); - sam_global_opt_help(write_to, "-...."); + sam_global_opt_help(write_to, "-....@"); fprintf(write_to, "\n" "Format string expansions:\n" @@ -95,11 +99,11 @@ static parsed_opts_t* parse_args(int argc, char** argv) { if (argc == 1) { usage(stdout); return NULL; } - const char* optstring = "vf:u:"; + const char* optstring = "vf:u:@:"; char* delim; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), { NULL, 0, NULL, 0 } }; @@ -143,7 +147,7 @@ static parsed_opts_t* parse_args(int argc, char** argv) argv += optind; if (argc != 1) { - fprintf(stderr, "Invalid number of arguments: %d\n", argc); + print_error("split", "Invalid number of arguments: %d", argc); usage(stderr); free(retval); return NULL; @@ -270,7 +274,7 @@ static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name) // Filters a header of @RG lines where ID != id_keep // TODO: strip @PG's descended from other RGs and their descendants -static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep) +static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list) { kstring_t str = {0, 0, NULL}; @@ -315,28 +319,52 @@ static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep) free(hdr->text); hdr->text = ks_release(&str); + // Add the PG line + SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text); + if (sam_hdr_add_PG(sh, "samtools", + "VN", samtools_version(), + arg_list ? "CL": NULL, + arg_list ? arg_list : NULL, + NULL) != 0) + return -1; + + free(hdr->text); + hdr->text = strdup(sam_hdr_str(sh)); + hdr->l_text = sam_hdr_length(sh); + if (!hdr->text) + return false; + sam_hdr_free(sh); + return true; } // Set the initial state -static state_t* init(parsed_opts_t* opts) +static state_t* init(parsed_opts_t* opts, const char *arg_list) { state_t* retval = calloc(sizeof(state_t), 1); if (!retval) { - fprintf(stderr, "Out of memory"); + print_error_errno("split", "Initialisation failed"); return NULL; } + if (opts->ga.nthreads > 0) { + if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) { + fprintf(stderr, "Error creating thread pool\n"); + return NULL; + } + } + retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); if (!retval->merged_input_file) { - fprintf(stderr, "Could not open input file (%s)\n", opts->merged_input_name); + print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); free(retval); return NULL; } + if (retval->p.pool) + hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p); retval->merged_input_header = sam_hdr_read(retval->merged_input_file); if (retval->merged_input_header == NULL) { - fprintf(stderr, "Could not read header for file '%s'\n", - opts->merged_input_name); + print_error("split", "Could not read header from \"%s\"", opts->merged_input_name); cleanup_state(retval, false); return NULL; } @@ -345,14 +373,13 @@ static state_t* init(parsed_opts_t* opts) if (opts->unaccounted_header_name) { samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in); if (!hdr_load) { - fprintf(stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name); + print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name); cleanup_state(retval, false); return NULL; } retval->unaccounted_header = sam_hdr_read(hdr_load); if (retval->unaccounted_header == NULL) { - fprintf(stderr, "Could not read header for file '%s'\n", - opts->unaccounted_header_name); + print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name); cleanup_state(retval, false); return NULL; } @@ -363,10 +390,12 @@ static state_t* init(parsed_opts_t* opts) retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); if (retval->unaccounted_file == NULL) { - fprintf(stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name); + print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name); cleanup_state(retval, false); return NULL; } + if (retval->p.pool) + hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p); } // Open output files for RGs @@ -378,7 +407,7 @@ static state_t* init(parsed_opts_t* opts) retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); retval->rg_hash = kh_init_c2i(); if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { - fprintf(stderr, "Could not allocate memory for output file array. Out of memory?"); + print_error_errno("split", "Could not initialise output file array"); cleanup_state(retval, false); return NULL; } @@ -386,7 +415,7 @@ static state_t* init(parsed_opts_t* opts) char* dirsep = strrchr(opts->merged_input_name, '/'); char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name); if (!input_base_name) { - fprintf(stderr, "Out of memory\n"); + print_error_errno("split", "Filename manipulation failed"); cleanup_state(retval, false); return NULL; } @@ -403,7 +432,7 @@ static state_t* init(parsed_opts_t* opts) &opts->ga.out); if ( output_filename == NULL ) { - fprintf(stderr, "Error expanding output filename format string.\n"); + print_error("split", "Error expanding output filename format string"); cleanup_state(retval, false); free(input_base_name); return NULL; @@ -412,11 +441,13 @@ static state_t* init(parsed_opts_t* opts) retval->rg_output_file_name[i] = output_filename; retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out); if (retval->rg_output_file[i] == NULL) { - fprintf(stderr, "Could not open output file: %s\n", output_filename); + print_error_errno("split", "Could not open \"%s\"", output_filename); cleanup_state(retval, false); free(input_base_name); return NULL; } + if (retval->p.pool) + hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p); // Record index in hash int ret; @@ -425,8 +456,8 @@ static state_t* init(parsed_opts_t* opts) // Set and edit header retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); - if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) { - fprintf(stderr, "Could not rewrite header for file: %s\n", output_filename); + if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) { + print_error("split", "Could not rewrite header for \"%s\"", output_filename); cleanup_state(retval, false); free(input_base_name); return NULL; @@ -441,14 +472,13 @@ static state_t* init(parsed_opts_t* opts) static bool split(state_t* state) { if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) { - fprintf(stderr, "Could not write output file header\n"); + print_error_errno("split", "Could not write output file header"); return false; } size_t i; for (i = 0; i < state->output_count; i++) { if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) { - fprintf(stderr, "Could not write output file header for '%s'\n", - state->rg_output_file_name[i]); + print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); return false; } } @@ -461,7 +491,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(stderr, "Could not read first input record\n"); + print_error("split", "Could not read first input record"); return false; } } @@ -482,8 +512,7 @@ static bool split(state_t* state) // if found write to the appropriate untangled bam int i = kh_val(state->rg_hash,iter); if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) { - fprintf(stderr, "Could not write to output file '%s'\n", - state->rg_output_file_name[i]); + print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]); bam_destroy1(file_read); return false; } @@ -499,7 +528,7 @@ static bool split(state_t* state) return false; } else { if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) { - fprintf(stderr, "Could not write to unaccounted output file\n"); + print_error_errno("split", "Could not write to unaccounted output file"); bam_destroy1(file_read); return false; } @@ -512,7 +541,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(stderr, "Could not read input record\n"); + print_error("split", "Could not read input record"); return false; } } @@ -529,7 +558,7 @@ static int cleanup_state(state_t* status, bool check_close) if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header); if (status->unaccounted_file) { if (sam_close(status->unaccounted_file) < 0 && check_close) { - fprintf(stderr, "Error on closing unaccounted file\n"); + print_error("split", "Error on closing unaccounted file"); ret = -1; } } @@ -540,8 +569,7 @@ static int cleanup_state(state_t* status, bool check_close) bam_hdr_destroy(status->rg_output_header[i]); if (status->rg_output_file && status->rg_output_file[i]) { if (sam_close(status->rg_output_file[i]) < 0 && check_close) { - fprintf(stderr, "Error on closing output file '%s'\n", - status->rg_output_file_name[i]); + print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]); ret = -1; } } @@ -557,6 +585,9 @@ static int cleanup_state(state_t* status, bool check_close) free(status->rg_id); free(status); + if (status->p.pool) + hts_tpool_destroy(status->p.pool); + return ret; } @@ -574,9 +605,10 @@ static void cleanup_opts(parsed_opts_t* opts) int main_split(int argc, char** argv) { int ret = 1; + char *arg_list = stringify_argv(argc+1, argv-1); parsed_opts_t* opts = parse_args(argc, argv); if (!opts) goto cleanup_opts; - state_t* status = init(opts); + state_t* status = init(opts, arg_list); if (!status) goto cleanup_opts; if (!split(status)) { @@ -588,6 +620,7 @@ int main_split(int argc, char** argv) cleanup_opts: cleanup_opts(opts); + free(arg_list); return ret; } diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c index 2348f48..8a584ed 100644 --- a/samtools/bam_split.c.pysam.c +++ b/samtools/bam_split.c.pysam.c @@ -2,7 +2,7 @@ /* bam_split.c -- split subcommand. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Martin Pollard @@ -36,7 +36,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include +#include "htslib/thread_pool.h" #include "sam_opts.h" +#include "samtools.h" KHASH_MAP_INIT_STR(c2i, int) @@ -63,6 +66,7 @@ struct state { samFile** rg_output_file; bam_hdr_t** rg_output_header; kh_c2i_t* rg_hash; + htsThreadPool p; }; typedef struct state state_t; @@ -80,7 +84,7 @@ static void usage(FILE *write_to) " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" " -u FILE1:FILE2 ...and override the header with FILE2\n" " -v verbose output\n"); - sam_global_opt_help(write_to, "-...."); + sam_global_opt_help(write_to, "-....@"); fprintf(write_to, "\n" "Format string expansions:\n" @@ -97,11 +101,11 @@ static parsed_opts_t* parse_args(int argc, char** argv) { if (argc == 1) { usage(pysam_stdout); return NULL; } - const char* optstring = "vf:u:"; + const char* optstring = "vf:u:@:"; char* delim; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), { NULL, 0, NULL, 0 } }; @@ -145,7 +149,7 @@ static parsed_opts_t* parse_args(int argc, char** argv) argv += optind; if (argc != 1) { - fprintf(pysam_stderr, "Invalid number of arguments: %d\n", argc); + print_error("split", "Invalid number of arguments: %d", argc); usage(pysam_stderr); free(retval); return NULL; @@ -272,7 +276,7 @@ static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name) // Filters a header of @RG lines where ID != id_keep // TODO: strip @PG's descended from other RGs and their descendants -static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep) +static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list) { kstring_t str = {0, 0, NULL}; @@ -317,28 +321,52 @@ static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep) free(hdr->text); hdr->text = ks_release(&str); + // Add the PG line + SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text); + if (sam_hdr_add_PG(sh, "samtools", + "VN", samtools_version(), + arg_list ? "CL": NULL, + arg_list ? arg_list : NULL, + NULL) != 0) + return -1; + + free(hdr->text); + hdr->text = strdup(sam_hdr_str(sh)); + hdr->l_text = sam_hdr_length(sh); + if (!hdr->text) + return false; + sam_hdr_free(sh); + return true; } // Set the initial state -static state_t* init(parsed_opts_t* opts) +static state_t* init(parsed_opts_t* opts, const char *arg_list) { state_t* retval = calloc(sizeof(state_t), 1); if (!retval) { - fprintf(pysam_stderr, "Out of memory"); + print_error_errno("split", "Initialisation failed"); return NULL; } + if (opts->ga.nthreads > 0) { + if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) { + fprintf(pysam_stderr, "Error creating thread pool\n"); + return NULL; + } + } + retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); if (!retval->merged_input_file) { - fprintf(pysam_stderr, "Could not open input file (%s)\n", opts->merged_input_name); + print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); free(retval); return NULL; } + if (retval->p.pool) + hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p); retval->merged_input_header = sam_hdr_read(retval->merged_input_file); if (retval->merged_input_header == NULL) { - fprintf(pysam_stderr, "Could not read header for file '%s'\n", - opts->merged_input_name); + print_error("split", "Could not read header from \"%s\"", opts->merged_input_name); cleanup_state(retval, false); return NULL; } @@ -347,14 +375,13 @@ static state_t* init(parsed_opts_t* opts) if (opts->unaccounted_header_name) { samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in); if (!hdr_load) { - fprintf(pysam_stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name); + print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name); cleanup_state(retval, false); return NULL; } retval->unaccounted_header = sam_hdr_read(hdr_load); if (retval->unaccounted_header == NULL) { - fprintf(pysam_stderr, "Could not read header for file '%s'\n", - opts->unaccounted_header_name); + print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name); cleanup_state(retval, false); return NULL; } @@ -365,10 +392,12 @@ static state_t* init(parsed_opts_t* opts) retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); if (retval->unaccounted_file == NULL) { - fprintf(pysam_stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name); + print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name); cleanup_state(retval, false); return NULL; } + if (retval->p.pool) + hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p); } // Open output files for RGs @@ -380,7 +409,7 @@ static state_t* init(parsed_opts_t* opts) retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); retval->rg_hash = kh_init_c2i(); if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { - fprintf(pysam_stderr, "Could not allocate memory for output file array. Out of memory?"); + print_error_errno("split", "Could not initialise output file array"); cleanup_state(retval, false); return NULL; } @@ -388,7 +417,7 @@ static state_t* init(parsed_opts_t* opts) char* dirsep = strrchr(opts->merged_input_name, '/'); char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name); if (!input_base_name) { - fprintf(pysam_stderr, "Out of memory\n"); + print_error_errno("split", "Filename manipulation failed"); cleanup_state(retval, false); return NULL; } @@ -405,7 +434,7 @@ static state_t* init(parsed_opts_t* opts) &opts->ga.out); if ( output_filename == NULL ) { - fprintf(pysam_stderr, "Error expanding output filename format string.\n"); + print_error("split", "Error expanding output filename format string"); cleanup_state(retval, false); free(input_base_name); return NULL; @@ -414,11 +443,13 @@ static state_t* init(parsed_opts_t* opts) retval->rg_output_file_name[i] = output_filename; retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out); if (retval->rg_output_file[i] == NULL) { - fprintf(pysam_stderr, "Could not open output file: %s\n", output_filename); + print_error_errno("split", "Could not open \"%s\"", output_filename); cleanup_state(retval, false); free(input_base_name); return NULL; } + if (retval->p.pool) + hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p); // Record index in hash int ret; @@ -427,8 +458,8 @@ static state_t* init(parsed_opts_t* opts) // Set and edit header retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); - if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) { - fprintf(pysam_stderr, "Could not rewrite header for file: %s\n", output_filename); + if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) { + print_error("split", "Could not rewrite header for \"%s\"", output_filename); cleanup_state(retval, false); free(input_base_name); return NULL; @@ -443,14 +474,13 @@ static state_t* init(parsed_opts_t* opts) static bool split(state_t* state) { if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) { - fprintf(pysam_stderr, "Could not write output file header\n"); + print_error_errno("split", "Could not write output file header"); return false; } size_t i; for (i = 0; i < state->output_count; i++) { if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) { - fprintf(pysam_stderr, "Could not write output file header for '%s'\n", - state->rg_output_file_name[i]); + print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); return false; } } @@ -463,7 +493,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(pysam_stderr, "Could not read first input record\n"); + print_error("split", "Could not read first input record"); return false; } } @@ -484,8 +514,7 @@ static bool split(state_t* state) // if found write to the appropriate untangled bam int i = kh_val(state->rg_hash,iter); if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) { - fprintf(pysam_stderr, "Could not write to output file '%s'\n", - state->rg_output_file_name[i]); + print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]); bam_destroy1(file_read); return false; } @@ -501,7 +530,7 @@ static bool split(state_t* state) return false; } else { if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) { - fprintf(pysam_stderr, "Could not write to unaccounted output file\n"); + print_error_errno("split", "Could not write to unaccounted output file"); bam_destroy1(file_read); return false; } @@ -514,7 +543,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(pysam_stderr, "Could not read input record\n"); + print_error("split", "Could not read input record"); return false; } } @@ -531,7 +560,7 @@ static int cleanup_state(state_t* status, bool check_close) if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header); if (status->unaccounted_file) { if (sam_close(status->unaccounted_file) < 0 && check_close) { - fprintf(pysam_stderr, "Error on closing unaccounted file\n"); + print_error("split", "Error on closing unaccounted file"); ret = -1; } } @@ -542,8 +571,7 @@ static int cleanup_state(state_t* status, bool check_close) bam_hdr_destroy(status->rg_output_header[i]); if (status->rg_output_file && status->rg_output_file[i]) { if (sam_close(status->rg_output_file[i]) < 0 && check_close) { - fprintf(pysam_stderr, "Error on closing output file '%s'\n", - status->rg_output_file_name[i]); + print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]); ret = -1; } } @@ -559,6 +587,9 @@ static int cleanup_state(state_t* status, bool check_close) free(status->rg_id); free(status); + if (status->p.pool) + hts_tpool_destroy(status->p.pool); + return ret; } @@ -576,9 +607,10 @@ static void cleanup_opts(parsed_opts_t* opts) int main_split(int argc, char** argv) { int ret = 1; + char *arg_list = stringify_argv(argc+1, argv-1); parsed_opts_t* opts = parse_args(argc, argv); if (!opts) goto cleanup_opts; - state_t* status = init(opts); + state_t* status = init(opts, arg_list); if (!status) goto cleanup_opts; if (!split(status)) { @@ -590,6 +622,7 @@ int main_split(int argc, char** argv) cleanup_opts: cleanup_opts(opts); + free(arg_list); return ret; } diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c index f6cf1d5..aa5f8d3 100644 --- a/samtools/bam_stat.c +++ b/samtools/bam_stat.c @@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" #include "samtools.h" +#include "sam_opts.h" typedef struct { long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; @@ -94,7 +95,8 @@ static const char *percent(char *buffer, long long n, long long total) static void usage_exit(FILE *fp, int exit_status) { - fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] \n"); + fprintf(fp, "Usage: samtools flagstat [options] \n"); + sam_global_opt_help(fp, "-.---@"); exit(exit_status); } @@ -104,25 +106,23 @@ int bam_flagstat(int argc, char *argv[]) bam_hdr_t *header; bam_flagstat_t *s; char b0[16], b1[16]; - hts_opt *in_opts = NULL; int c; enum { INPUT_FMT_OPTION = CHAR_MAX+1, }; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION}, + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) { switch (c) { - case INPUT_FMT_OPTION: - if (hts_opt_add(&in_opts, optarg) < 0) - usage_exit(stderr, EXIT_FAILURE); - break; - default: + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage_exit(stderr, EXIT_FAILURE); } } @@ -131,15 +131,13 @@ int bam_flagstat(int argc, char *argv[]) if (argc == optind) usage_exit(stdout, EXIT_SUCCESS); else usage_exit(stderr, EXIT_FAILURE); } - fp = sam_open(argv[optind], "r"); + fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]); return 1; } - if (hts_opt_apply(fp, in_opts)) { - fprintf(stderr, "Failed to apply input-fmt-options\n"); - return 1; - } + if (ga.nthreads > 0) + hts_set_threads(fp, ga.nthreads); if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) { @@ -174,6 +172,6 @@ int bam_flagstat(int argc, char *argv[]) free(s); bam_hdr_destroy(header); sam_close(fp); - hts_opt_free(in_opts); + sam_global_args_free(&ga); return 0; } diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c index cdca4dd..bbfe602 100644 --- a/samtools/bam_stat.c.pysam.c +++ b/samtools/bam_stat.c.pysam.c @@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" #include "samtools.h" +#include "sam_opts.h" typedef struct { long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; @@ -94,10 +95,11 @@ static const char *percent(char *buffer, long long n, long long total) return buffer; } -static void usage_exit(FILE *fp, int exit_status) +static int usage_exit(FILE *fp, int exit_status) { - fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] \n"); - exit(exit_status); + fprintf(fp, "Usage: samtools flagstat [options] \n"); + sam_global_opt_help(fp, "-.---@"); + return(exit_status); } int bam_flagstat(int argc, char *argv[]) @@ -106,42 +108,38 @@ int bam_flagstat(int argc, char *argv[]) bam_hdr_t *header; bam_flagstat_t *s; char b0[16], b1[16]; - hts_opt *in_opts = NULL; int c; enum { INPUT_FMT_OPTION = CHAR_MAX+1, }; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION}, + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) { switch (c) { - case INPUT_FMT_OPTION: - if (hts_opt_add(&in_opts, optarg) < 0) - usage_exit(pysam_stderr, EXIT_FAILURE); - break; - default: - usage_exit(pysam_stderr, EXIT_FAILURE); + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + return(usage_exit(pysam_stderr, EXIT_FAILURE)); } } if (argc != optind+1) { - if (argc == optind) usage_exit(pysam_stdout, EXIT_SUCCESS); - else usage_exit(pysam_stderr, EXIT_FAILURE); + if (argc == optind) return(usage_exit(pysam_stdout, EXIT_SUCCESS)); + else return(usage_exit(pysam_stderr, EXIT_FAILURE)); } - fp = sam_open(argv[optind], "r"); + fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]); return 1; } - if (hts_opt_apply(fp, in_opts)) { - fprintf(pysam_stderr, "Failed to apply input-fmt-options\n"); - return 1; - } + if (ga.nthreads > 0) + hts_set_threads(fp, ga.nthreads); if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) { @@ -176,6 +174,6 @@ int bam_flagstat(int argc, char *argv[]) free(s); bam_hdr_destroy(header); sam_close(fp); - hts_opt_free(in_opts); + sam_global_args_free(&ga); return 0; } diff --git a/samtools/bam_tview.c b/samtools/bam_tview.c deleted file mode 100644 index f1f0cc7..0000000 --- a/samtools/bam_tview.c +++ /dev/null @@ -1,441 +0,0 @@ -/* bam_tview.c -- tview subcommand. - - Copyright (C) 2008-2015 Genome Research Ltd. - Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, UniversitÃ© de Nantes. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notices and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include -#include "bam_tview.h" -#include -#include -#include -#include "sam_opts.h" - -khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample) -{ - khash_t(kh_rg)* rg_hash = kh_init(kh_rg); - // given sample id return all the RD ID's - const char rg_regex[] = "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)"; - - regex_t rg_id; - regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t)); - if (matches == NULL) { perror("out of memory"); exit(-1); } - regcomp(&rg_id, rg_regex, REG_EXTENDED|REG_NEWLINE); - char* text = strdup(header); - char* end = text + strlen(header); - char* tofree = text; - while (end > text && regexec(&rg_id, text, 2, matches, 0) == 0) { // foreach rg id in header - int ret; - text[matches[1].rm_eo] = '\0'; - kh_put(kh_rg, rg_hash, strdup(text+matches[1].rm_so), &ret); // Add the RG to the list - text += matches[0].rm_eo + 1; // Move search pointer forward - } - free(tofree); - return rg_hash; -} - -int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt) -{ - assert(tv!=NULL); - assert(fn!=NULL); - tv->mrow = 24; tv->mcol = 80; - tv->color_for = TV_COLOR_MAPQ; - tv->is_dot = 1; - - tv->fp = sam_open_format(fn, "r", fmt); - if(tv->fp == NULL) - { - fprintf(stderr,"sam_open %s. %s\n", fn,fn_fa); - exit(EXIT_FAILURE); - } - // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024); - assert(tv->fp); - - tv->header = sam_hdr_read(tv->fp); - if(tv->header == NULL) - { - fprintf(stderr,"Cannot read '%s'.\n", fn); - exit(EXIT_FAILURE); - } - tv->idx = sam_index_load(tv->fp, fn); - if (tv->idx == NULL) - { - fprintf(stderr,"Cannot read index for '%s'.\n", fn); - exit(EXIT_FAILURE); - } - tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); - if (fn_fa) tv->fai = fai_load(fn_fa); - tv->bca = bcf_call_init(0.83, 13); - tv->ins = 1; - - // If the user has asked for specific samples find out create a list of readgroups make up these samples - if ( samples ) - { - tv->rg_hash = get_rg_sample(tv->header->text, samples); // Init the list of rg's - } - - return 0; -} - - -void base_tv_destroy(tview_t* tv) -{ - bam_lplbuf_destroy(tv->lplbuf); - bcf_call_destroy(tv->bca); - hts_idx_destroy(tv->idx); - if (tv->fai) fai_destroy(tv->fai); - free(tv->ref); - bam_hdr_destroy(tv->header); - sam_close(tv->fp); -} - - -int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) -{ - tview_t *tv = (tview_t*)data; - int i, j, c, rb, attr, max_ins = 0; - uint32_t call = 0; - if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen - // print reference - rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; - for (i = tv->last_pos + 1; i < pos; ++i) { - if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1); - c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; - tv->my_mvaddch(tv,1, tv->ccol++, c); - } - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); - { // call consensus - bcf_callret1_t bcr; - memset(&bcr, 0, sizeof bcr); - int qsum[4], a1, a2, tmp; - double p[3], prior = 30; - bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr); - for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i; - for (i = 1; i < 4; ++i) // insertion sort - for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j) - tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; - a1 = qsum[0]&3; a2 = qsum[1]&3; - p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2]; - if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3; - if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3; - if (p[0] < p[1] && p[0] < p[2]) call = (1<my_underline(tv); - c = ",ACMGRSVTWYHKDBN"[call>>16&0xf]; - i = (call&0xffff)/10+1; - if (i > 4) i = 4; - attr |= tv->my_colorpair(tv,i); - if (c == toupper(rb)) c = '.'; - tv->my_attron(tv,attr); - tv->my_mvaddch(tv,2, tv->ccol, c); - tv->my_attroff(tv,attr); - if(tv->ins) { - // calculate maximum insert - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel; - } - } - // core loop - for (j = 0; j <= max_ins; ++j) { - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - int row = TV_MIN_ALNROW + p->level - tv->row_shift; - if (j == 0) { - if (!p->is_del) { - if (tv->base_for == TV_BASE_COLOR_SPACE && - (c = bam_aux_getCSi(p->b, p->qpos))) { - // assume that if we found one color, we will be able to get the color error - if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.'; - } else { - if (tv->show_name) { - char *name = bam_get_qname(p->b); - c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos]; - } else { - c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]; - if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.'; - } - } - } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*'; - } else { // padding - if (j > p->indel) c = '*'; - else { // insertion - if (tv->base_for == TV_BASE_NUCL) { - if (tv->show_name) { - char *name = bam_get_qname(p->b); - c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j]; - } else { - c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; - if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.'; - } - } else { - c = bam_aux_getCSi(p->b, p->qpos + j); - if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.'; - } - } - } - if (row > TV_MIN_ALNROW && row < tv->mrow) { - int x; - attr = 0; - if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) - || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv); - if (tv->color_for == TV_COLOR_BASEQ) { - x = bam_get_qual(p->b)[p->qpos]/10 + 1; - if (x > 4) x = 4; - attr |= tv->my_colorpair(tv,x); - } else if (tv->color_for == TV_COLOR_MAPQ) { - x = p->b->core.qual/10 + 1; - if (x > 4) x = 4; - attr |= tv->my_colorpair(tv,x); - } else if (tv->color_for == TV_COLOR_NUCL) { - x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5; - attr |= tv->my_colorpair(tv,x); - } else if(tv->color_for == TV_COLOR_COL) { - x = 0; - switch(bam_aux_getCSi(p->b, p->qpos)) { - case '0': x = 0; break; - case '1': x = 1; break; - case '2': x = 2; break; - case '3': x = 3; break; - case '4': x = 4; break; - default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break; - } - x+=5; - attr |= tv->my_colorpair(tv,x); - } else if(tv->color_for == TV_COLOR_COLQ) { - x = bam_aux_getCQi(p->b, p->qpos); - if(0 == x) x = bam_get_qual(p->b)[p->qpos]; - x = x/10 + 1; - if (x > 4) x = 4; - attr |= tv->my_colorpair(tv,x); - } - tv->my_attron(tv,attr); - tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c)); - tv->my_attroff(tv,attr); - } - } - c = j? '*' : rb; - if (c == '*') { - attr = tv->my_colorpair(tv,8); - tv->my_attron(tv,attr); - tv->my_mvaddch(tv,1, tv->ccol++, c); - tv->my_attroff(tv,attr); - } else tv->my_mvaddch(tv,1, tv->ccol++, c); - } - tv->last_pos = pos; - return 0; -} - - - - -static int tv_push_aln(const bam1_t *b, tview_t *tv) -{ - /* If we are restricted to specific readgroups check RG is in the list */ - if ( tv->rg_hash ) - { - const uint8_t *rg = bam_aux_get(b, "RG"); - if ( !rg ) return 0; // If we don't have an RG tag exclude read - khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1)); - if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read - } - if (tv->no_skip) { - uint32_t *cigar = bam_get_cigar(b); // this is cheating... - int i; - for (i = 0; i core.n_cigar; ++i) { - if ((cigar[i]&0xf) == BAM_CREF_SKIP) - cigar[i] = cigar[i]>>4<<4 | BAM_CDEL; - } - } - bam_lplbuf_push(b, tv->lplbuf); - return 0; -} - -int base_draw_aln(tview_t *tv, int tid, int pos) -{ - assert(tv!=NULL); - // reset - tv->my_clear(tv); - tv->curr_tid = tid; tv->left_pos = pos; - tv->last_pos = tv->left_pos - 1; - tv->ccol = 0; - // print ref and consensus - if (tv->fai) { - char *str; - if (tv->ref) free(tv->ref); - assert(tv->curr_tid>=0); - - str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); - assert(str!=NULL); - sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); - tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); - free(str); - if ( !tv->ref ) - { - fprintf(stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n"); - exit(1); - } - } - // draw aln - bam_lplbuf_reset(tv->lplbuf); - hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol); - bam1_t *b = bam_init1(); - while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv); - bam_destroy1(b); - hts_itr_destroy(iter); - bam_lplbuf_push(0, tv->lplbuf); - - while (tv->ccol < tv->mcol) { - int pos = tv->last_pos + 1; - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); - tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'); - ++tv->last_pos; - } - return 0; -} - - - - -static void error(const char *format, ...) -{ - if ( !format ) - { - fprintf(stderr, -"Usage: samtools tview [options] [ref.fasta]\n" -"Options:\n" -" -d display output as (H)tml or (C)urses or (T)ext \n" -" -p chr:pos go directly to this position\n" -" -s STR display only reads from this sample or group\n"); - sam_global_opt_help(stderr, "-.--."); - } - else - { - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); - } - exit(-1); -} - -enum dipsay_mode {display_ncurses,display_html,display_text}; -extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt); -extern tview_t* html_tv_init(const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt); -extern tview_t* text_tv_init(const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt); - -int bam_tview_main(int argc, char *argv[]) -{ - int view_mode=display_ncurses; - tview_t* tv=NULL; - char *samples=NULL, *position=NULL, *ref; - int c; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), - { NULL, 0, NULL, 0 } - }; - - while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) { - switch (c) { - case 's': samples=optarg; break; - case 'p': position=optarg; break; - case 'd': - { - switch(optarg[0]) - { - case 'H': case 'h': view_mode=display_html;break; - case 'T': case 't': view_mode=display_text;break; - case 'C': case 'c': view_mode=display_ncurses;break; - default: view_mode=display_ncurses;break; - } - break; - } - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': error(NULL); - } - } - if (argc==optind) error(NULL); - - ref = (optind+1>=argc)? ga.reference : argv[optind+1]; - - switch(view_mode) - { - case display_ncurses: - tv = curses_tv_init(argv[optind], ref, samples, &ga.in); - break; - - case display_text: - tv = text_tv_init(argv[optind], ref, samples, &ga.in); - break; - - case display_html: - tv = html_tv_init(argv[optind], ref, samples, &ga.in); - break; - } - if (tv==NULL) - { - error("cannot create view"); - return EXIT_FAILURE; - } - - if ( position ) - { - int tid, beg, end; - char *name_lim = (char *) hts_parse_reg(position, &beg, &end); - if (name_lim) *name_lim = '\0'; - else beg = 0; // region parsing failed, but possibly a seq named "foo:a" - tid = bam_name2id(tv->header, position); - if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; } - } - else if ( tv->fai ) - { - // find the first sequence present in both BAM and the reference file - int i; - for (i=0; iheader->n_targets; i++) - { - if ( faidx_has_seq(tv->fai, tv->header->target_name[i]) ) break; - } - if ( i==tv->header->n_targets ) - { - fprintf(stderr,"None of the BAM sequence names present in the fasta file\n"); - exit(EXIT_FAILURE); - } - tv->curr_tid = i; - } - tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); - tv->my_loop(tv); - tv->my_destroy(tv); - - return EXIT_SUCCESS; -} diff --git a/samtools/bam_tview.c.pysam.c b/samtools/bam_tview.c.pysam.c deleted file mode 100644 index a47bced..0000000 --- a/samtools/bam_tview.c.pysam.c +++ /dev/null @@ -1,443 +0,0 @@ -#include "pysam.h" - -/* bam_tview.c -- tview subcommand. - - Copyright (C) 2008-2015 Genome Research Ltd. - Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, UniversitÃ© de Nantes. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notices and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include -#include "bam_tview.h" -#include -#include -#include -#include "sam_opts.h" - -khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample) -{ - khash_t(kh_rg)* rg_hash = kh_init(kh_rg); - // given sample id return all the RD ID's - const char rg_regex[] = "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)"; - - regex_t rg_id; - regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t)); - if (matches == NULL) { perror("out of memory"); exit(-1); } - regcomp(&rg_id, rg_regex, REG_EXTENDED|REG_NEWLINE); - char* text = strdup(header); - char* end = text + strlen(header); - char* tofree = text; - while (end > text && regexec(&rg_id, text, 2, matches, 0) == 0) { // foreach rg id in header - int ret; - text[matches[1].rm_eo] = '\0'; - kh_put(kh_rg, rg_hash, strdup(text+matches[1].rm_so), &ret); // Add the RG to the list - text += matches[0].rm_eo + 1; // Move search pointer forward - } - free(tofree); - return rg_hash; -} - -int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt) -{ - assert(tv!=NULL); - assert(fn!=NULL); - tv->mrow = 24; tv->mcol = 80; - tv->color_for = TV_COLOR_MAPQ; - tv->is_dot = 1; - - tv->fp = sam_open_format(fn, "r", fmt); - if(tv->fp == NULL) - { - fprintf(pysam_stderr,"sam_open %s. %s\n", fn,fn_fa); - exit(EXIT_FAILURE); - } - // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024); - assert(tv->fp); - - tv->header = sam_hdr_read(tv->fp); - if(tv->header == NULL) - { - fprintf(pysam_stderr,"Cannot read '%s'.\n", fn); - exit(EXIT_FAILURE); - } - tv->idx = sam_index_load(tv->fp, fn); - if (tv->idx == NULL) - { - fprintf(pysam_stderr,"Cannot read index for '%s'.\n", fn); - exit(EXIT_FAILURE); - } - tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); - if (fn_fa) tv->fai = fai_load(fn_fa); - tv->bca = bcf_call_init(0.83, 13); - tv->ins = 1; - - // If the user has asked for specific samples find out create a list of readgroups make up these samples - if ( samples ) - { - tv->rg_hash = get_rg_sample(tv->header->text, samples); // Init the list of rg's - } - - return 0; -} - - -void base_tv_destroy(tview_t* tv) -{ - bam_lplbuf_destroy(tv->lplbuf); - bcf_call_destroy(tv->bca); - hts_idx_destroy(tv->idx); - if (tv->fai) fai_destroy(tv->fai); - free(tv->ref); - bam_hdr_destroy(tv->header); - sam_close(tv->fp); -} - - -int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) -{ - tview_t *tv = (tview_t*)data; - int i, j, c, rb, attr, max_ins = 0; - uint32_t call = 0; - if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen - // print reference - rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; - for (i = tv->last_pos + 1; i < pos; ++i) { - if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1); - c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; - tv->my_mvaddch(tv,1, tv->ccol++, c); - } - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); - { // call consensus - bcf_callret1_t bcr; - memset(&bcr, 0, sizeof bcr); - int qsum[4], a1, a2, tmp; - double p[3], prior = 30; - bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr); - for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i; - for (i = 1; i < 4; ++i) // insertion sort - for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j) - tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; - a1 = qsum[0]&3; a2 = qsum[1]&3; - p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2]; - if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3; - if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3; - if (p[0] < p[1] && p[0] < p[2]) call = (1<my_underline(tv); - c = ",ACMGRSVTWYHKDBN"[call>>16&0xf]; - i = (call&0xffff)/10+1; - if (i > 4) i = 4; - attr |= tv->my_colorpair(tv,i); - if (c == toupper(rb)) c = '.'; - tv->my_attron(tv,attr); - tv->my_mvaddch(tv,2, tv->ccol, c); - tv->my_attroff(tv,attr); - if(tv->ins) { - // calculate maximum insert - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel; - } - } - // core loop - for (j = 0; j <= max_ins; ++j) { - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - int row = TV_MIN_ALNROW + p->level - tv->row_shift; - if (j == 0) { - if (!p->is_del) { - if (tv->base_for == TV_BASE_COLOR_SPACE && - (c = bam_aux_getCSi(p->b, p->qpos))) { - // assume that if we found one color, we will be able to get the color error - if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.'; - } else { - if (tv->show_name) { - char *name = bam_get_qname(p->b); - c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos]; - } else { - c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]; - if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.'; - } - } - } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*'; - } else { // padding - if (j > p->indel) c = '*'; - else { // insertion - if (tv->base_for == TV_BASE_NUCL) { - if (tv->show_name) { - char *name = bam_get_qname(p->b); - c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j]; - } else { - c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; - if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.'; - } - } else { - c = bam_aux_getCSi(p->b, p->qpos + j); - if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.'; - } - } - } - if (row > TV_MIN_ALNROW && row < tv->mrow) { - int x; - attr = 0; - if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) - || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv); - if (tv->color_for == TV_COLOR_BASEQ) { - x = bam_get_qual(p->b)[p->qpos]/10 + 1; - if (x > 4) x = 4; - attr |= tv->my_colorpair(tv,x); - } else if (tv->color_for == TV_COLOR_MAPQ) { - x = p->b->core.qual/10 + 1; - if (x > 4) x = 4; - attr |= tv->my_colorpair(tv,x); - } else if (tv->color_for == TV_COLOR_NUCL) { - x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5; - attr |= tv->my_colorpair(tv,x); - } else if(tv->color_for == TV_COLOR_COL) { - x = 0; - switch(bam_aux_getCSi(p->b, p->qpos)) { - case '0': x = 0; break; - case '1': x = 1; break; - case '2': x = 2; break; - case '3': x = 3; break; - case '4': x = 4; break; - default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break; - } - x+=5; - attr |= tv->my_colorpair(tv,x); - } else if(tv->color_for == TV_COLOR_COLQ) { - x = bam_aux_getCQi(p->b, p->qpos); - if(0 == x) x = bam_get_qual(p->b)[p->qpos]; - x = x/10 + 1; - if (x > 4) x = 4; - attr |= tv->my_colorpair(tv,x); - } - tv->my_attron(tv,attr); - tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c)); - tv->my_attroff(tv,attr); - } - } - c = j? '*' : rb; - if (c == '*') { - attr = tv->my_colorpair(tv,8); - tv->my_attron(tv,attr); - tv->my_mvaddch(tv,1, tv->ccol++, c); - tv->my_attroff(tv,attr); - } else tv->my_mvaddch(tv,1, tv->ccol++, c); - } - tv->last_pos = pos; - return 0; -} - - - - -static int tv_push_aln(const bam1_t *b, tview_t *tv) -{ - /* If we are restricted to specific readgroups check RG is in the list */ - if ( tv->rg_hash ) - { - const uint8_t *rg = bam_aux_get(b, "RG"); - if ( !rg ) return 0; // If we don't have an RG tag exclude read - khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1)); - if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read - } - if (tv->no_skip) { - uint32_t *cigar = bam_get_cigar(b); // this is cheating... - int i; - for (i = 0; i core.n_cigar; ++i) { - if ((cigar[i]&0xf) == BAM_CREF_SKIP) - cigar[i] = cigar[i]>>4<<4 | BAM_CDEL; - } - } - bam_lplbuf_push(b, tv->lplbuf); - return 0; -} - -int base_draw_aln(tview_t *tv, int tid, int pos) -{ - assert(tv!=NULL); - // reset - tv->my_clear(tv); - tv->curr_tid = tid; tv->left_pos = pos; - tv->last_pos = tv->left_pos - 1; - tv->ccol = 0; - // print ref and consensus - if (tv->fai) { - char *str; - if (tv->ref) free(tv->ref); - assert(tv->curr_tid>=0); - - str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); - assert(str!=NULL); - sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); - tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); - free(str); - if ( !tv->ref ) - { - fprintf(pysam_stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n"); - exit(1); - } - } - // draw aln - bam_lplbuf_reset(tv->lplbuf); - hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol); - bam1_t *b = bam_init1(); - while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv); - bam_destroy1(b); - hts_itr_destroy(iter); - bam_lplbuf_push(0, tv->lplbuf); - - while (tv->ccol < tv->mcol) { - int pos = tv->last_pos + 1; - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); - tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'); - ++tv->last_pos; - } - return 0; -} - - - - -static void error(const char *format, ...) -{ - if ( !format ) - { - fprintf(pysam_stderr, -"Usage: samtools tview [options] [ref.fasta]\n" -"Options:\n" -" -d display output as (H)tml or (C)urses or (T)ext \n" -" -p chr:pos go directly to this position\n" -" -s STR display only reads from this sample or group\n"); - sam_global_opt_help(pysam_stderr, "-.--."); - } - else - { - va_list ap; - va_start(ap, format); - vfprintf(pysam_stderr, format, ap); - va_end(ap); - } - exit(-1); -} - -enum dipsay_mode {display_ncurses,display_html,display_text}; -extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt); -extern tview_t* html_tv_init(const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt); -extern tview_t* text_tv_init(const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt); - -int bam_tview_main(int argc, char *argv[]) -{ - int view_mode=display_ncurses; - tview_t* tv=NULL; - char *samples=NULL, *position=NULL, *ref; - int c; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), - { NULL, 0, NULL, 0 } - }; - - while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) { - switch (c) { - case 's': samples=optarg; break; - case 'p': position=optarg; break; - case 'd': - { - switch(optarg[0]) - { - case 'H': case 'h': view_mode=display_html;break; - case 'T': case 't': view_mode=display_text;break; - case 'C': case 'c': view_mode=display_ncurses;break; - default: view_mode=display_ncurses;break; - } - break; - } - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': error(NULL); - } - } - if (argc==optind) error(NULL); - - ref = (optind+1>=argc)? ga.reference : argv[optind+1]; - - switch(view_mode) - { - case display_ncurses: - tv = curses_tv_init(argv[optind], ref, samples, &ga.in); - break; - - case display_text: - tv = text_tv_init(argv[optind], ref, samples, &ga.in); - break; - - case display_html: - tv = html_tv_init(argv[optind], ref, samples, &ga.in); - break; - } - if (tv==NULL) - { - error("cannot create view"); - return EXIT_FAILURE; - } - - if ( position ) - { - int tid, beg, end; - char *name_lim = (char *) hts_parse_reg(position, &beg, &end); - if (name_lim) *name_lim = '\0'; - else beg = 0; // region parsing failed, but possibly a seq named "foo:a" - tid = bam_name2id(tv->header, position); - if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; } - } - else if ( tv->fai ) - { - // find the first sequence present in both BAM and the reference file - int i; - for (i=0; iheader->n_targets; i++) - { - if ( faidx_has_seq(tv->fai, tv->header->target_name[i]) ) break; - } - if ( i==tv->header->n_targets ) - { - fprintf(pysam_stderr,"None of the BAM sequence names present in the fasta file\n"); - exit(EXIT_FAILURE); - } - tv->curr_tid = i; - } - tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); - tv->my_loop(tv); - tv->my_destroy(tv); - - return EXIT_SUCCESS; -} diff --git a/samtools/bam_tview.h b/samtools/bam_tview.h deleted file mode 100644 index e11e39d..0000000 --- a/samtools/bam_tview.h +++ /dev/null @@ -1,105 +0,0 @@ -/* bam_tview.h -- tview subcommand. - - Copyright (C) 2008, 2013 Genome Research Ltd. - Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, UniversitÃ© de Nantes. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notices and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#ifndef BAM_TVIEW_H -#define BAM_TVIEW_H - -#include -#include -#include -#include -#include -#include -#include "bam2bcf.h" -#include -#include -#include -#include "bam_lpileup.h" - - -KHASH_MAP_INIT_STR(kh_rg, const char *) - -/* Holds state of Tview */ -typedef struct AbstractTview { - int mrow, mcol; - - hts_idx_t* idx; - bam_lplbuf_t* lplbuf; - bam_hdr_t* header; - samFile* fp; - int curr_tid, left_pos; - faidx_t* fai; - bcf_callaux_t* bca; - - int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins; - int no_skip, show_name, inverse; - char *ref; - /* maps @RG ID => SM (sample), in practice only used to determine whether a particular RG is in the list of allowed ones */ - khash_t(kh_rg) *rg_hash; - /* callbacks */ - void (*my_destroy)(struct AbstractTview* ); - void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); - void (*my_mvaddch)(struct AbstractTview*,int,int,int); - void (*my_attron)(struct AbstractTview*,int); - void (*my_attroff)(struct AbstractTview*,int); - void (*my_clear)(struct AbstractTview*); - int (*my_colorpair)(struct AbstractTview*,int); - int (*my_drawaln)(struct AbstractTview*,int,int); - int (*my_loop)(struct AbstractTview*); - int (*my_underline)(struct AbstractTview*); -} tview_t; - - -char bam_aux_getCEi(bam1_t *b, int i); -char bam_aux_getCSi(bam1_t *b, int i); -char bam_aux_getCQi(bam1_t *b, int i); - -#define TV_MIN_ALNROW 2 -#define TV_MAX_GOTO 40 -#define TV_LOW_MAPQ 10 - -#define TV_COLOR_MAPQ 0 -#define TV_COLOR_BASEQ 1 -#define TV_COLOR_NUCL 2 -#define TV_COLOR_COL 3 -#define TV_COLOR_COLQ 4 - -#define TV_BASE_NUCL 0 -#define TV_BASE_COLOR_SPACE 1 - -int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); -int base_tv_init(tview_t*,const char *fn, const char *fn_fa, - const char *samples, const htsFormat *fmt); -void base_tv_destroy(tview_t*); -int base_draw_aln(tview_t *tv, int tid, int pos); - -typedef struct Tixel - { - int ch; - int attributes; - }tixel_t; - -#endif - diff --git a/samtools/bam_tview_curses.c b/samtools/bam_tview_curses.c deleted file mode 100644 index d7edfe8..0000000 --- a/samtools/bam_tview_curses.c +++ /dev/null @@ -1,352 +0,0 @@ -/* bam_tview_curses.c -- curses tview implementation. - - Copyright (C) 2008-2013 Genome Research Ltd. - Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, UniversitÃ© de Nantes. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notices and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include "bam_tview.h" - -#ifdef HAVE_CURSES - -#if defined HAVE_NCURSESW_CURSES_H -#include -#elif defined HAVE_NCURSESW_H -#include -#elif defined HAVE_NCURSES_CURSES_H -#include -#elif defined HAVE_NCURSES_H -#include -#elif defined HAVE_CURSES_H -#include -#endif - -typedef struct CursesTview { - tview_t view; - WINDOW *wgoto, *whelp; - } curses_tview_t; - -#define FROM_TV(ptr) ((curses_tview_t*)ptr) - -static void curses_destroy(tview_t* base) - { - curses_tview_t* tv=(curses_tview_t*)base; - - - delwin(tv->wgoto); delwin(tv->whelp); - endwin(); - - base_tv_destroy(base); - - free(tv); - } - -/* - void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); - void (*my_)(struct AbstractTview*,int,int,int); - void (*my_attron)(struct AbstractTview*,int); - void (*my_attroff)(struct AbstractTview*,int); - void (*my_clear)(struct AbstractTview*); - int (*my_colorpair)(struct AbstractTview*,int); -*/ - -static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) - { - unsigned int size=tv->mcol+2; - char* str=malloc(size); - if(str==0) exit(EXIT_FAILURE); - va_list argptr; - va_start(argptr, fmt); - vsnprintf(str,size, fmt, argptr); - va_end(argptr); - mvprintw(y,x,str); - free(str); - } - -static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch) - { - mvaddch(y,x,ch); - } - -static void curses_attron(struct AbstractTview* tv,int flag) - { - attron(flag); - } -static void curses_attroff(struct AbstractTview* tv,int flag) - { - attroff(flag); - } -static void curses_clear(struct AbstractTview* tv) - { - clear(); - } - -static int curses_init_colors(int inverse) -{ - if (inverse) { - init_pair(1, COLOR_WHITE, COLOR_BLUE); - init_pair(2, COLOR_BLACK, COLOR_GREEN); - init_pair(3, COLOR_BLACK, COLOR_YELLOW); - init_pair(4, COLOR_BLACK, COLOR_WHITE); - init_pair(5, COLOR_BLACK, COLOR_GREEN); - init_pair(6, COLOR_BLACK, COLOR_CYAN); - init_pair(7, COLOR_WHITE, COLOR_MAGENTA); - init_pair(8, COLOR_WHITE, COLOR_RED); - init_pair(9, COLOR_WHITE, COLOR_BLUE); - } else { - init_pair(1, COLOR_BLUE, COLOR_BLACK); - init_pair(2, COLOR_GREEN, COLOR_BLACK); - init_pair(3, COLOR_YELLOW, COLOR_BLACK); - init_pair(4, COLOR_WHITE, COLOR_BLACK); - init_pair(5, COLOR_GREEN, COLOR_BLACK); - init_pair(6, COLOR_CYAN, COLOR_BLACK); - init_pair(7, COLOR_MAGENTA, COLOR_BLACK); - init_pair(8, COLOR_RED, COLOR_BLACK); - init_pair(9, COLOR_BLUE, COLOR_BLACK); - } - - return 0; -} - -static int curses_colorpair(struct AbstractTview* tv,int flag) - { - return COLOR_PAIR(flag); - } - -static int curses_drawaln(struct AbstractTview* tv, int tid, int pos) - { - return base_draw_aln(tv, tid, pos); - } - - - -static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos) - { - char str[256], *p; - int i, l = 0; - tview_t *base=(tview_t*)tv; - wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(tv->wgoto, 1, 2, "Goto: "); - for (;;) { - int invalid = 0; - int c = wgetch(tv->wgoto); - wrefresh(tv->wgoto); - if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { - if(l > 0) --l; - } else if (c == KEY_ENTER || c == '\012' || c == '\015') { - int _tid = -1, _beg, _end; - if (str[0] == '=') { - _beg = strtol(str+1, &p, 10) - 1; - if (_beg > 0) { - *pos = _beg; - return; - } - } else { - char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end); - if (name_lim) { - char name_terminator = *name_lim; - *name_lim = '\0'; - _tid = bam_name2id(base->header, str); - *name_lim = name_terminator; - } - else { - // Unparsable region, but possibly a sequence named "foo:a" - _tid = bam_name2id(base->header, str); - _beg = 0; - } - - if (_tid >= 0) { - *tid = _tid; *pos = _beg; - return; - } - } - - // If we get here, the region string is invalid - invalid = 1; - } else if (isgraph(c)) { - if (l < TV_MAX_GOTO) str[l++] = c; - } else if (c == '\027') l = 0; - else if (c == '\033') return; - str[l] = '\0'; - for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); - if (invalid) mvwprintw(tv->wgoto, 1, TV_MAX_GOTO - 1, "[Invalid]"); - mvwprintw(tv->wgoto, 1, 8, "%s", str); - } -} - - - - -static void tv_win_help(curses_tview_t *tv) { - int r = 1; - tview_t* base=(tview_t*)base; - WINDOW *win = tv->whelp; - wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(win, r++, 2, " -=- Help -=- "); - r++; - mvwprintw(win, r++, 2, "? This window"); - mvwprintw(win, r++, 2, "Arrows Small scroll movement"); - mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); - mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); - mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); - mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); - mvwprintw(win, r++, 2, "space Scroll one screen"); - mvwprintw(win, r++, 2, "backspace Scroll back one screen"); - mvwprintw(win, r++, 2, "g Go to specific location"); - mvwprintw(win, r++, 2, "m Color for mapping qual"); - mvwprintw(win, r++, 2, "n Color for nucleotide"); - mvwprintw(win, r++, 2, "b Color for base quality"); - mvwprintw(win, r++, 2, "c Color for cs color"); - mvwprintw(win, r++, 2, "z Color for cs qual"); - mvwprintw(win, r++, 2, ". Toggle on/off dot view"); - mvwprintw(win, r++, 2, "s Toggle on/off ref skip"); - mvwprintw(win, r++, 2, "r Toggle on/off rd name"); - mvwprintw(win, r++, 2, "N Turn on nt view"); - mvwprintw(win, r++, 2, "C Turn on cs view"); - mvwprintw(win, r++, 2, "i Toggle on/off ins"); - mvwprintw(win, r++, 2, "v Inverse video"); - mvwprintw(win, r++, 2, "q Exit"); - r++; - mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); - mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); - mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); - wrefresh(win); - wgetch(win); -} - -static int curses_underline(tview_t* tv) - { - return A_UNDERLINE; - } - -static int curses_loop(tview_t* tv) - { - int tid, pos; - curses_tview_t *CTV=(curses_tview_t *)tv; - tid = tv->curr_tid; pos = tv->left_pos; - while (1) { - int c = getch(); - switch (c) { - case '?': tv_win_help(CTV); break; - case '\033': - case 'q': goto end_loop; - case '/': - case 'g': tv_win_goto(CTV, &tid, &pos); break; - case 'm': tv->color_for = TV_COLOR_MAPQ; break; - case 'b': tv->color_for = TV_COLOR_BASEQ; break; - case 'n': tv->color_for = TV_COLOR_NUCL; break; - case 'c': tv->color_for = TV_COLOR_COL; break; - case 'z': tv->color_for = TV_COLOR_COLQ; break; - case 'v': curses_init_colors(tv->inverse = !tv->inverse); break; - case 's': tv->no_skip = !tv->no_skip; break; - case 'r': tv->show_name = !tv->show_name; break; - case KEY_LEFT: - case 'h': --pos; break; - case KEY_RIGHT: - case 'l': ++pos; break; - case KEY_SLEFT: - case 'H': pos -= 20; break; - case KEY_SRIGHT: - case 'L': pos += 20; break; - case '.': tv->is_dot = !tv->is_dot; break; - case 'N': tv->base_for = TV_BASE_NUCL; break; - case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; - case 'i': tv->ins = !tv->ins; break; - case '\010': pos -= 1000; break; - case '\014': pos += 1000; break; - case ' ': pos += tv->mcol; break; - case KEY_UP: - case 'j': --tv->row_shift; break; - case KEY_DOWN: - case 'k': ++tv->row_shift; break; - case KEY_BACKSPACE: - case '\177': pos -= tv->mcol; break; - case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; - default: continue; - } - if (pos < 0) pos = 0; - if (tv->row_shift < 0) tv->row_shift = 0; - tv->my_drawaln(tv, tid, pos); - } -end_loop: - return 0; -} - - - - -tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t)); - tview_t* base=(tview_t*)tv; - if(tv==0) - { - fprintf(stderr,"Calloc failed\n"); - return 0; - } - - base_tv_init(base,fn,fn_fa,samples,fmt); - /* initialize callbacks */ -#define SET_CALLBACK(fun) base->my_##fun=curses_##fun; - SET_CALLBACK(destroy); - SET_CALLBACK(mvprintw); - SET_CALLBACK(mvaddch); - SET_CALLBACK(attron); - SET_CALLBACK(attroff); - SET_CALLBACK(clear); - SET_CALLBACK(colorpair); - SET_CALLBACK(drawaln); - SET_CALLBACK(loop); - SET_CALLBACK(underline); -#undef SET_CALLBACK - - initscr(); - keypad(stdscr, TRUE); - clear(); - noecho(); - cbreak(); - - getmaxyx(stdscr, base->mrow, base->mcol); - tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); - tv->whelp = newwin(30, 40, 5, 5); - - start_color(); - curses_init_colors(0); - return base; - } - -#else // !HAVE_CURSES - -#warning "No curses library is available; tview with curses is disabled." - -extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt); - -tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - return text_tv_init(fn,fn_fa,samples,fmt); - } - -#endif diff --git a/samtools/bam_tview_curses.c.pysam.c b/samtools/bam_tview_curses.c.pysam.c deleted file mode 100644 index 90a8335..0000000 --- a/samtools/bam_tview_curses.c.pysam.c +++ /dev/null @@ -1,354 +0,0 @@ -#include "pysam.h" - -/* bam_tview_curses.c -- curses tview implementation. - - Copyright (C) 2008-2013 Genome Research Ltd. - Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, UniversitÃ© de Nantes. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notices and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include "bam_tview.h" - -#ifdef HAVE_CURSES - -#if defined HAVE_NCURSESW_CURSES_H -#include -#elif defined HAVE_NCURSESW_H -#include -#elif defined HAVE_NCURSES_CURSES_H -#include -#elif defined HAVE_NCURSES_H -#include -#elif defined HAVE_CURSES_H -#include -#endif - -typedef struct CursesTview { - tview_t view; - WINDOW *wgoto, *whelp; - } curses_tview_t; - -#define FROM_TV(ptr) ((curses_tview_t*)ptr) - -static void curses_destroy(tview_t* base) - { - curses_tview_t* tv=(curses_tview_t*)base; - - - delwin(tv->wgoto); delwin(tv->whelp); - endwin(); - - base_tv_destroy(base); - - free(tv); - } - -/* - void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); - void (*my_)(struct AbstractTview*,int,int,int); - void (*my_attron)(struct AbstractTview*,int); - void (*my_attroff)(struct AbstractTview*,int); - void (*my_clear)(struct AbstractTview*); - int (*my_colorpair)(struct AbstractTview*,int); -*/ - -static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) - { - unsigned int size=tv->mcol+2; - char* str=malloc(size); - if(str==0) exit(EXIT_FAILURE); - va_list argptr; - va_start(argptr, fmt); - vsnprintf(str,size, fmt, argptr); - va_end(argptr); - mvprintw(y,x,str); - free(str); - } - -static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch) - { - mvaddch(y,x,ch); - } - -static void curses_attron(struct AbstractTview* tv,int flag) - { - attron(flag); - } -static void curses_attroff(struct AbstractTview* tv,int flag) - { - attroff(flag); - } -static void curses_clear(struct AbstractTview* tv) - { - clear(); - } - -static int curses_init_colors(int inverse) -{ - if (inverse) { - init_pair(1, COLOR_WHITE, COLOR_BLUE); - init_pair(2, COLOR_BLACK, COLOR_GREEN); - init_pair(3, COLOR_BLACK, COLOR_YELLOW); - init_pair(4, COLOR_BLACK, COLOR_WHITE); - init_pair(5, COLOR_BLACK, COLOR_GREEN); - init_pair(6, COLOR_BLACK, COLOR_CYAN); - init_pair(7, COLOR_WHITE, COLOR_MAGENTA); - init_pair(8, COLOR_WHITE, COLOR_RED); - init_pair(9, COLOR_WHITE, COLOR_BLUE); - } else { - init_pair(1, COLOR_BLUE, COLOR_BLACK); - init_pair(2, COLOR_GREEN, COLOR_BLACK); - init_pair(3, COLOR_YELLOW, COLOR_BLACK); - init_pair(4, COLOR_WHITE, COLOR_BLACK); - init_pair(5, COLOR_GREEN, COLOR_BLACK); - init_pair(6, COLOR_CYAN, COLOR_BLACK); - init_pair(7, COLOR_MAGENTA, COLOR_BLACK); - init_pair(8, COLOR_RED, COLOR_BLACK); - init_pair(9, COLOR_BLUE, COLOR_BLACK); - } - - return 0; -} - -static int curses_colorpair(struct AbstractTview* tv,int flag) - { - return COLOR_PAIR(flag); - } - -static int curses_drawaln(struct AbstractTview* tv, int tid, int pos) - { - return base_draw_aln(tv, tid, pos); - } - - - -static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos) - { - char str[256], *p; - int i, l = 0; - tview_t *base=(tview_t*)tv; - wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(tv->wgoto, 1, 2, "Goto: "); - for (;;) { - int invalid = 0; - int c = wgetch(tv->wgoto); - wrefresh(tv->wgoto); - if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { - if(l > 0) --l; - } else if (c == KEY_ENTER || c == '\012' || c == '\015') { - int _tid = -1, _beg, _end; - if (str[0] == '=') { - _beg = strtol(str+1, &p, 10) - 1; - if (_beg > 0) { - *pos = _beg; - return; - } - } else { - char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end); - if (name_lim) { - char name_terminator = *name_lim; - *name_lim = '\0'; - _tid = bam_name2id(base->header, str); - *name_lim = name_terminator; - } - else { - // Unparsable region, but possibly a sequence named "foo:a" - _tid = bam_name2id(base->header, str); - _beg = 0; - } - - if (_tid >= 0) { - *tid = _tid; *pos = _beg; - return; - } - } - - // If we get here, the region string is invalid - invalid = 1; - } else if (isgraph(c)) { - if (l < TV_MAX_GOTO) str[l++] = c; - } else if (c == '\027') l = 0; - else if (c == '\033') return; - str[l] = '\0'; - for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); - if (invalid) mvwprintw(tv->wgoto, 1, TV_MAX_GOTO - 1, "[Invalid]"); - mvwprintw(tv->wgoto, 1, 8, "%s", str); - } -} - - - - -static void tv_win_help(curses_tview_t *tv) { - int r = 1; - tview_t* base=(tview_t*)base; - WINDOW *win = tv->whelp; - wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(win, r++, 2, " -=- Help -=- "); - r++; - mvwprintw(win, r++, 2, "? This window"); - mvwprintw(win, r++, 2, "Arrows Small scroll movement"); - mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); - mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); - mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); - mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); - mvwprintw(win, r++, 2, "space Scroll one screen"); - mvwprintw(win, r++, 2, "backspace Scroll back one screen"); - mvwprintw(win, r++, 2, "g Go to specific location"); - mvwprintw(win, r++, 2, "m Color for mapping qual"); - mvwprintw(win, r++, 2, "n Color for nucleotide"); - mvwprintw(win, r++, 2, "b Color for base quality"); - mvwprintw(win, r++, 2, "c Color for cs color"); - mvwprintw(win, r++, 2, "z Color for cs qual"); - mvwprintw(win, r++, 2, ". Toggle on/off dot view"); - mvwprintw(win, r++, 2, "s Toggle on/off ref skip"); - mvwprintw(win, r++, 2, "r Toggle on/off rd name"); - mvwprintw(win, r++, 2, "N Turn on nt view"); - mvwprintw(win, r++, 2, "C Turn on cs view"); - mvwprintw(win, r++, 2, "i Toggle on/off ins"); - mvwprintw(win, r++, 2, "v Inverse video"); - mvwprintw(win, r++, 2, "q Exit"); - r++; - mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); - mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); - mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); - wrefresh(win); - wgetch(win); -} - -static int curses_underline(tview_t* tv) - { - return A_UNDERLINE; - } - -static int curses_loop(tview_t* tv) - { - int tid, pos; - curses_tview_t *CTV=(curses_tview_t *)tv; - tid = tv->curr_tid; pos = tv->left_pos; - while (1) { - int c = getch(); - switch (c) { - case '?': tv_win_help(CTV); break; - case '\033': - case 'q': goto end_loop; - case '/': - case 'g': tv_win_goto(CTV, &tid, &pos); break; - case 'm': tv->color_for = TV_COLOR_MAPQ; break; - case 'b': tv->color_for = TV_COLOR_BASEQ; break; - case 'n': tv->color_for = TV_COLOR_NUCL; break; - case 'c': tv->color_for = TV_COLOR_COL; break; - case 'z': tv->color_for = TV_COLOR_COLQ; break; - case 'v': curses_init_colors(tv->inverse = !tv->inverse); break; - case 's': tv->no_skip = !tv->no_skip; break; - case 'r': tv->show_name = !tv->show_name; break; - case KEY_LEFT: - case 'h': --pos; break; - case KEY_RIGHT: - case 'l': ++pos; break; - case KEY_SLEFT: - case 'H': pos -= 20; break; - case KEY_SRIGHT: - case 'L': pos += 20; break; - case '.': tv->is_dot = !tv->is_dot; break; - case 'N': tv->base_for = TV_BASE_NUCL; break; - case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; - case 'i': tv->ins = !tv->ins; break; - case '\010': pos -= 1000; break; - case '\014': pos += 1000; break; - case ' ': pos += tv->mcol; break; - case KEY_UP: - case 'j': --tv->row_shift; break; - case KEY_DOWN: - case 'k': ++tv->row_shift; break; - case KEY_BACKSPACE: - case '\177': pos -= tv->mcol; break; - case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; - default: continue; - } - if (pos < 0) pos = 0; - if (tv->row_shift < 0) tv->row_shift = 0; - tv->my_drawaln(tv, tid, pos); - } -end_loop: - return 0; -} - - - - -tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t)); - tview_t* base=(tview_t*)tv; - if(tv==0) - { - fprintf(pysam_stderr,"Calloc failed\n"); - return 0; - } - - base_tv_init(base,fn,fn_fa,samples,fmt); - /* initialize callbacks */ -#define SET_CALLBACK(fun) base->my_##fun=curses_##fun; - SET_CALLBACK(destroy); - SET_CALLBACK(mvprintw); - SET_CALLBACK(mvaddch); - SET_CALLBACK(attron); - SET_CALLBACK(attroff); - SET_CALLBACK(clear); - SET_CALLBACK(colorpair); - SET_CALLBACK(drawaln); - SET_CALLBACK(loop); - SET_CALLBACK(underline); -#undef SET_CALLBACK - - initscr(); - keypad(stdscr, TRUE); - clear(); - noecho(); - cbreak(); - - getmaxyx(stdscr, base->mrow, base->mcol); - tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); - tv->whelp = newwin(30, 40, 5, 5); - - start_color(); - curses_init_colors(0); - return base; - } - -#else // !HAVE_CURSES - -#warning "No curses library is available; tview with curses is disabled." - -extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt); - -tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - return text_tv_init(fn,fn_fa,samples,fmt); - } - -#endif diff --git a/samtools/bam_tview_html.c b/samtools/bam_tview_html.c deleted file mode 100644 index e3aecda..0000000 --- a/samtools/bam_tview_html.c +++ /dev/null @@ -1,377 +0,0 @@ -/* bam_tview_html.c -- HTML tview output. - - Copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, UniversitÃ© de Nantes. - - Author: Pierre Lindenbaum - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include "bam_tview.h" - -#define UNDERLINE_FLAG 10 - -typedef struct HtmlTview { - tview_t view; - int row_count; - tixel_t** screen; - FILE* out; - int attributes;/* color... */ - } html_tview_t; - -#define FROM_TV(ptr) ((html_tview_t*)ptr) - -static void html_destroy(tview_t* base) - { - int i; - html_tview_t* tv=(html_tview_t*)base; - if(tv->screen!=NULL) - { - for(i=0;i< tv->row_count;++i) free(tv->screen[i]); - free(tv->screen); - } - base_tv_destroy(base); - free(tv); - } - -/* - void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); - void (*my_)(struct AbstractTview*,int,int,int); - void (*my_attron)(struct AbstractTview*,int); - void (*my_attroff)(struct AbstractTview*,int); - void (*my_clear)(struct AbstractTview*); - int (*my_colorpair)(struct AbstractTview*,int); -*/ - -static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) - { - int i,nchars=0; - unsigned int size=tv->mcol+2; - char* str=malloc(size); - if(str==0) exit(EXIT_FAILURE); - va_list argptr; - va_start(argptr, fmt); - nchars=vsnprintf(str,size, fmt, argptr); - va_end(argptr); - - for(i=0;i< nchars;++i) - { - tv->my_mvaddch(tv,y,x+i,str[i]); - } - free(str); - } - -static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch) - { - tixel_t* row=NULL; - html_tview_t* ptr=FROM_TV(tv); - if( x >= tv->mcol ) return; //out of screen - while(ptr->row_count<=y) - { - int x; - row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t)); - if(row==0) exit(EXIT_FAILURE); - for(x=0;xmcol;++x) {row[x].ch=' ';row[x].attributes=0;} - ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1)); - ptr->screen[ptr->row_count++]=row; - } - row=ptr->screen[y]; - row[x].ch=ch; - row[x].attributes=ptr->attributes; - } - -static void html_attron(struct AbstractTview* tv,int flag) - { - html_tview_t* ptr=FROM_TV(tv); - ptr->attributes |= flag; - - - } - -static void html_attroff(struct AbstractTview* tv,int flag) - { - html_tview_t* ptr=FROM_TV(tv); - ptr->attributes &= ~(flag); - } - -static void html_clear(struct AbstractTview* tv) - { - html_tview_t* ptr=FROM_TV(tv); - if(ptr->screen!=NULL) - { - int i; - for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]); - free(ptr->screen); - ptr->screen=NULL; - } - ptr->row_count=0; - ptr->attributes=0; - } - -static int html_colorpair(struct AbstractTview* tv,int flag) - { - return (1 << (flag)); - } - -static int html_drawaln(struct AbstractTview* tv, int tid, int pos) - { - int y,x; - html_tview_t* ptr=FROM_TV(tv); - html_clear(tv); - base_draw_aln(tv, tid, pos); - fputs("",ptr->out); - fprintf(ptr->out,"%s:%d", - tv->header->target_name[tid], - pos+1 - ); - //style - - fputs("",ptr->out); - - fputs("",ptr->out); - - fprintf(ptr->out,"
%s:%d
", - tv->header->target_name[tid], - pos+1 - ); - - fputs("
",ptr->out); - for(y=0;y< ptr->row_count;++y) - { - - for(x=0;x< tv->mcol;++x) - { - - - if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes) - { - int css=0; - fprintf(ptr->out,"1) fprintf(stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes); - if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) - { - - fprintf(ptr->out," class='tviewc%s%d'", - (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""), - css); - break; - } - ++css; - } - - - fputs(">",ptr->out); - } - - int ch=ptr->screen[y][x].ch; - switch(ch) - { - case '<': fputs("<",ptr->out);break; - case '>': fputs(">",ptr->out);break; - case '&': fputs("&",ptr->out);break; - default: fputc(ch,ptr->out); break; - } - - - if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes) - { - fputs("",ptr->out); - } - } - if(y+1 < ptr->row_count) fputs("
",ptr->out); - } - fputs("
",ptr->out); - return 0; - } - - -#define ANSI_COLOR_RED "\x1b[31m" -#define ANSI_COLOR_GREEN "\x1b[32m" -#define ANSI_COLOR_YELLOW "\x1b[33m" -#define ANSI_COLOR_BLUE "\x1b[34m" -#define ANSI_COLOR_MAGENTA "\x1b[35m" -#define ANSI_COLOR_CYAN "\x1b[36m" -#define ANSI_COLOR_BLACK "\x1b[0m" -#define ANSI_COLOR_RESET ANSI_COLOR_BLACK - -#define ANSI_UNDERLINE_SET "\033[4m" -#define ANSI_UNDERLINE_UNSET "\033[0m" - -static int text_drawaln(struct AbstractTview* tv, int tid, int pos) - { - int y,x; - html_tview_t* ptr=FROM_TV(tv); - html_clear(tv); - base_draw_aln(tv, tid, pos); - int is_term= isatty(fileno(ptr->out)); - - for(y=0;y< ptr->row_count;++y) - { - for(x=0;x< tv->mcol;++x) - { - if(is_term) - { - int css=0; - while(css<32) - { - if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) - { - break; - } - ++css; - } - switch(css) - { - //CSS(0, "black"); - case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break; - case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break; - case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break; - //CSS(4, "black"); - case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break; - case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break; - case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break; - case 8: fputs(ANSI_COLOR_RED,ptr->out); break; - case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break; - default:break; - } - if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) - { - fputs(ANSI_UNDERLINE_SET,ptr->out); - } - - } - - - int ch=ptr->screen[y][x].ch; - - fputc(ch,ptr->out); - if(is_term) - { - fputs(ANSI_COLOR_RESET,ptr->out); - if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) - { - fputs(ANSI_UNDERLINE_UNSET,ptr->out); - } - } - } - fputc('\n',ptr->out); - } - return 0; - } - - -static int html_loop(tview_t* tv) - { - //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); - return 0; - } - -static int html_underline(tview_t* tv) - { - return (1 << UNDERLINE_FLAG); - } - -/* -static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper) - { - - } -*/ - -tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - char* colstr=getenv("COLUMNS"); - html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t)); - tview_t* base=(tview_t*)tv; - if(tv==0) - { - fprintf(stderr,"Calloc failed\n"); - return 0; - } - tv->row_count=0; - tv->screen=NULL; - tv->out=stdout; - tv->attributes=0; - base_tv_init(base,fn,fn_fa,samples,fmt); - /* initialize callbacks */ -#define SET_CALLBACK(fun) base->my_##fun=html_##fun; - SET_CALLBACK(destroy); - SET_CALLBACK(mvprintw); - SET_CALLBACK(mvaddch); - SET_CALLBACK(attron); - SET_CALLBACK(attroff); - SET_CALLBACK(clear); - SET_CALLBACK(colorpair); - SET_CALLBACK(drawaln); - SET_CALLBACK(loop); - SET_CALLBACK(underline); -#undef SET_CALLBACK - - - if(colstr!=0) - { - base->mcol=atoi(colstr); - if(base->mcol<10) base->mcol=80; - } - base->mrow=99999; - -/* - init_pair(tv,1, "blue", "white"); - init_pair(tv,2, "green", "white"); - init_pair(tv,3, "yellow", "white"); - init_pair(tv,4, "white", "white"); - init_pair(tv,5, "green", "white"); - init_pair(tv,6, "cyan", "white"); - init_pair(tv,7, "yellow", "white"); - init_pair(tv,8, "red", "white"); - init_pair(tv,9, "blue", "white"); - */ - return base; - } - - -tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt); - tv->my_drawaln=text_drawaln; - return tv; - } - diff --git a/samtools/bam_tview_html.c.pysam.c b/samtools/bam_tview_html.c.pysam.c deleted file mode 100644 index 164e33d..0000000 --- a/samtools/bam_tview_html.c.pysam.c +++ /dev/null @@ -1,379 +0,0 @@ -#include "pysam.h" - -/* bam_tview_html.c -- HTML tview output. - - Copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, UniversitÃ© de Nantes. - - Author: Pierre Lindenbaum - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include "bam_tview.h" - -#define UNDERLINE_FLAG 10 - -typedef struct HtmlTview { - tview_t view; - int row_count; - tixel_t** screen; - FILE* out; - int attributes;/* color... */ - } html_tview_t; - -#define FROM_TV(ptr) ((html_tview_t*)ptr) - -static void html_destroy(tview_t* base) - { - int i; - html_tview_t* tv=(html_tview_t*)base; - if(tv->screen!=NULL) - { - for(i=0;i< tv->row_count;++i) free(tv->screen[i]); - free(tv->screen); - } - base_tv_destroy(base); - free(tv); - } - -/* - void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); - void (*my_)(struct AbstractTview*,int,int,int); - void (*my_attron)(struct AbstractTview*,int); - void (*my_attroff)(struct AbstractTview*,int); - void (*my_clear)(struct AbstractTview*); - int (*my_colorpair)(struct AbstractTview*,int); -*/ - -static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) - { - int i,nchars=0; - unsigned int size=tv->mcol+2; - char* str=malloc(size); - if(str==0) exit(EXIT_FAILURE); - va_list argptr; - va_start(argptr, fmt); - nchars=vsnprintf(str,size, fmt, argptr); - va_end(argptr); - - for(i=0;i< nchars;++i) - { - tv->my_mvaddch(tv,y,x+i,str[i]); - } - free(str); - } - -static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch) - { - tixel_t* row=NULL; - html_tview_t* ptr=FROM_TV(tv); - if( x >= tv->mcol ) return; //out of screen - while(ptr->row_count<=y) - { - int x; - row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t)); - if(row==0) exit(EXIT_FAILURE); - for(x=0;xmcol;++x) {row[x].ch=' ';row[x].attributes=0;} - ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1)); - ptr->screen[ptr->row_count++]=row; - } - row=ptr->screen[y]; - row[x].ch=ch; - row[x].attributes=ptr->attributes; - } - -static void html_attron(struct AbstractTview* tv,int flag) - { - html_tview_t* ptr=FROM_TV(tv); - ptr->attributes |= flag; - - - } - -static void html_attroff(struct AbstractTview* tv,int flag) - { - html_tview_t* ptr=FROM_TV(tv); - ptr->attributes &= ~(flag); - } - -static void html_clear(struct AbstractTview* tv) - { - html_tview_t* ptr=FROM_TV(tv); - if(ptr->screen!=NULL) - { - int i; - for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]); - free(ptr->screen); - ptr->screen=NULL; - } - ptr->row_count=0; - ptr->attributes=0; - } - -static int html_colorpair(struct AbstractTview* tv,int flag) - { - return (1 << (flag)); - } - -static int html_drawaln(struct AbstractTview* tv, int tid, int pos) - { - int y,x; - html_tview_t* ptr=FROM_TV(tv); - html_clear(tv); - base_draw_aln(tv, tid, pos); - fputs("",ptr->out); - fprintf(ptr->out,"%s:%d", - tv->header->target_name[tid], - pos+1 - ); - //style - - fputs("",ptr->out); - - fputs("",ptr->out); - - fprintf(ptr->out,"
%s:%d
", - tv->header->target_name[tid], - pos+1 - ); - - fputs("
",ptr->out); - for(y=0;y< ptr->row_count;++y) - { - - for(x=0;x< tv->mcol;++x) - { - - - if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes) - { - int css=0; - fprintf(ptr->out,"1) fprintf(pysam_stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes); - if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) - { - - fprintf(ptr->out," class='tviewc%s%d'", - (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""), - css); - break; - } - ++css; - } - - - fputs(">",ptr->out); - } - - int ch=ptr->screen[y][x].ch; - switch(ch) - { - case '<': fputs("<",ptr->out);break; - case '>': fputs(">",ptr->out);break; - case '&': fputs("&",ptr->out);break; - default: fputc(ch,ptr->out); break; - } - - - if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes) - { - fputs("",ptr->out); - } - } - if(y+1 < ptr->row_count) fputs("
",ptr->out); - } - fputs("
",ptr->out); - return 0; - } - - -#define ANSI_COLOR_RED "\x1b[31m" -#define ANSI_COLOR_GREEN "\x1b[32m" -#define ANSI_COLOR_YELLOW "\x1b[33m" -#define ANSI_COLOR_BLUE "\x1b[34m" -#define ANSI_COLOR_MAGENTA "\x1b[35m" -#define ANSI_COLOR_CYAN "\x1b[36m" -#define ANSI_COLOR_BLACK "\x1b[0m" -#define ANSI_COLOR_RESET ANSI_COLOR_BLACK - -#define ANSI_UNDERLINE_SET "\033[4m" -#define ANSI_UNDERLINE_UNSET "\033[0m" - -static int text_drawaln(struct AbstractTview* tv, int tid, int pos) - { - int y,x; - html_tview_t* ptr=FROM_TV(tv); - html_clear(tv); - base_draw_aln(tv, tid, pos); - int is_term= isatty(fileno(ptr->out)); - - for(y=0;y< ptr->row_count;++y) - { - for(x=0;x< tv->mcol;++x) - { - if(is_term) - { - int css=0; - while(css<32) - { - if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) - { - break; - } - ++css; - } - switch(css) - { - //CSS(0, "black"); - case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break; - case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break; - case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break; - //CSS(4, "black"); - case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break; - case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break; - case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break; - case 8: fputs(ANSI_COLOR_RED,ptr->out); break; - case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break; - default:break; - } - if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) - { - fputs(ANSI_UNDERLINE_SET,ptr->out); - } - - } - - - int ch=ptr->screen[y][x].ch; - - fputc(ch,ptr->out); - if(is_term) - { - fputs(ANSI_COLOR_RESET,ptr->out); - if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) - { - fputs(ANSI_UNDERLINE_UNSET,ptr->out); - } - } - } - fputc('\n',ptr->out); - } - return 0; - } - - -static int html_loop(tview_t* tv) - { - //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); - return 0; - } - -static int html_underline(tview_t* tv) - { - return (1 << UNDERLINE_FLAG); - } - -/* -static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper) - { - - } -*/ - -tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - char* colstr=getenv("COLUMNS"); - html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t)); - tview_t* base=(tview_t*)tv; - if(tv==0) - { - fprintf(pysam_stderr,"Calloc failed\n"); - return 0; - } - tv->row_count=0; - tv->screen=NULL; - tv->out=pysam_stdout; - tv->attributes=0; - base_tv_init(base,fn,fn_fa,samples,fmt); - /* initialize callbacks */ -#define SET_CALLBACK(fun) base->my_##fun=html_##fun; - SET_CALLBACK(destroy); - SET_CALLBACK(mvprintw); - SET_CALLBACK(mvaddch); - SET_CALLBACK(attron); - SET_CALLBACK(attroff); - SET_CALLBACK(clear); - SET_CALLBACK(colorpair); - SET_CALLBACK(drawaln); - SET_CALLBACK(loop); - SET_CALLBACK(underline); -#undef SET_CALLBACK - - - if(colstr!=0) - { - base->mcol=atoi(colstr); - if(base->mcol<10) base->mcol=80; - } - base->mrow=99999; - -/* - init_pair(tv,1, "blue", "white"); - init_pair(tv,2, "green", "white"); - init_pair(tv,3, "yellow", "white"); - init_pair(tv,4, "white", "white"); - init_pair(tv,5, "green", "white"); - init_pair(tv,6, "cyan", "white"); - init_pair(tv,7, "yellow", "white"); - init_pair(tv,8, "red", "white"); - init_pair(tv,9, "blue", "white"); - */ - return base; - } - - -tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples, - const htsFormat *fmt) - { - tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt); - tv->my_drawaln=text_drawaln; - return tv; - } - diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c index 044bc4e..e24689e 100644 --- a/samtools/bamshuf.c +++ b/samtools/bamshuf.c @@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts.h" #include "htslib/ksort.h" #include "samtools.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #define DEF_CLEVEL 1 @@ -86,6 +87,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, bam_hdr_t *h = NULL; int64_t j, max_cnt = 0, *cnt = NULL; elem_t *a = NULL; + htsThreadPool p = {NULL, 0}; + + if (ga->nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga->nthreads))) { + print_error_errno("collate", "Error creating thread pool\n"); + return 1; + } + } // Read input, distribute reads pseudo-randomly into n_files temporary // files. @@ -94,6 +103,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, print_error_errno("collate", "Cannot open input file \"%s\"", fn); return 1; } + if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); h = sam_hdr_read(fp); if (h == NULL) { @@ -173,6 +183,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre); goto fail; } + if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p); if (sam_hdr_write(fpw, h) < 0) { print_error_errno("collate", "Couldn't write header"); @@ -193,6 +204,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]); goto fail; } + if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header // Slurp in one of the split files @@ -228,6 +240,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, return 1; } + if (p.pool) hts_tpool_destroy(p.pool); return 0; mem_fail: @@ -249,13 +262,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, free(fnt); free(fpt); free(cnt); + if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(ga); return 1; } static int usage(FILE *fp, int n_files) { fprintf(fp, - "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] \n\n" + "Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] \n\n" "Options:\n" " -O output to stdout\n" " -u uncompressed BAM output\n" @@ -263,7 +277,7 @@ static int usage(FILE *fp, int n_files) { " -n INT number of temporary files [%d]\n", // n_files DEF_CLEVEL, n_files); - sam_global_opt_help(fp, "-...."); + sam_global_opt_help(fp, "-....@"); return 1; } @@ -273,11 +287,11 @@ int main_bamshuf(int argc, char *argv[]) int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "n:l:uO@:", lopts, NULL)) >= 0) { switch (c) { case 'n': n_files = atoi(optarg); break; case 'l': clevel = atoi(optarg); break; diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c index fb1a5ac..04cd37b 100644 --- a/samtools/bamshuf.c.pysam.c +++ b/samtools/bamshuf.c.pysam.c @@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts.h" #include "htslib/ksort.h" #include "samtools.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #define DEF_CLEVEL 1 @@ -88,6 +89,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, bam_hdr_t *h = NULL; int64_t j, max_cnt = 0, *cnt = NULL; elem_t *a = NULL; + htsThreadPool p = {NULL, 0}; + + if (ga->nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga->nthreads))) { + print_error_errno("collate", "Error creating thread pool\n"); + return 1; + } + } // Read input, distribute reads pseudo-randomly into n_files temporary // files. @@ -96,6 +105,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, print_error_errno("collate", "Cannot open input file \"%s\"", fn); return 1; } + if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); h = sam_hdr_read(fp); if (h == NULL) { @@ -175,6 +185,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre); goto fail; } + if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p); if (sam_hdr_write(fpw, h) < 0) { print_error_errno("collate", "Couldn't write header"); @@ -195,6 +206,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]); goto fail; } + if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header // Slurp in one of the split files @@ -230,6 +242,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, return 1; } + if (p.pool) hts_tpool_destroy(p.pool); return 0; mem_fail: @@ -251,13 +264,14 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, free(fnt); free(fpt); free(cnt); + if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(ga); return 1; } static int usage(FILE *fp, int n_files) { fprintf(fp, - "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] \n\n" + "Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] \n\n" "Options:\n" " -O output to pysam_stdout\n" " -u uncompressed BAM output\n" @@ -265,7 +279,7 @@ static int usage(FILE *fp, int n_files) { " -n INT number of temporary files [%d]\n", // n_files DEF_CLEVEL, n_files); - sam_global_opt_help(fp, "-...."); + sam_global_opt_help(fp, "-....@"); return 1; } @@ -275,11 +289,11 @@ int main_bamshuf(int argc, char *argv[]) int c, n_files = 64, clevel = DEF_CLEVEL, is_pysam_stdout = 0, is_un = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "n:l:uO@:", lopts, NULL)) >= 0) { switch (c) { case 'n': n_files = atoi(optarg); break; case 'l': clevel = atoi(optarg); break; diff --git a/samtools/bamtk.c b/samtools/bamtk.c index 5c1c60d..bd520b6 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -1,6 +1,6 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2016 Genome Research Ltd. + Copyright (C) 2008-2017 Genome Research Ltd. Author: Heng Li @@ -27,9 +27,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include -#include + #include "htslib/hts.h" #include "samtools.h" #include "version.h" @@ -69,34 +68,6 @@ const char *samtools_version() return SAMTOOLS_VERSION; } -static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) -{ - fflush(stdout); - if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand); - else fprintf(stderr, "samtools: "); - vfprintf(stderr, format, args); - if (extra) fprintf(stderr, ": %s\n", extra); - else fprintf(stderr, "\n"); - fflush(stderr); -} - -void print_error(const char *subcommand, const char *format, ...) -{ - va_list args; - va_start(args, format); - vprint_error_core(subcommand, format, args, NULL); - va_end(args); -} - -void print_error_errno(const char *subcommand, const char *format, ...) -{ - int err = errno; - va_list args; - va_start(args, format); - vprint_error_core(subcommand, format, args, strerror(err)); - va_end(args); -} - static void usage(FILE *fp) { /* Please improve the grouping */ @@ -215,7 +186,7 @@ int main(int argc, char *argv[]) printf( "samtools %s\n" "Using htslib %s\n" -"Copyright (C) 2016 Genome Research Ltd.\n", +"Copyright (C) 2017 Genome Research Ltd.\n", samtools_version(), hts_version()); } else if (strcmp(argv[1], "--version-only") == 0) { diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index 1f3d938..8956b1f 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -2,7 +2,7 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2016 Genome Research Ltd. + Copyright (C) 2008-2017 Genome Research Ltd. Author: Heng Li @@ -29,9 +29,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include -#include + #include "htslib/hts.h" #include "samtools.h" #include "version.h" @@ -41,7 +40,7 @@ int bam_mpileup(int argc, char *argv[]); int bam_merge(int argc, char *argv[]); int bam_index(int argc, char *argv[]); int bam_sort(int argc, char *argv[]); -int bam_tview_main(int argc, char *argv[]); +/* AH: int bam_tview_main(int argc, char *argv[]); */ int bam_mating(int argc, char *argv[]); int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); @@ -71,34 +70,6 @@ const char *samtools_version() return SAMTOOLS_VERSION; } -static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) -{ - fflush(pysam_stdout); - if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand); - else fprintf(pysam_stderr, "samtools: "); - vfprintf(pysam_stderr, format, args); - if (extra) fprintf(pysam_stderr, ": %s\n", extra); - else fprintf(pysam_stderr, "\n"); - fflush(pysam_stderr); -} - -void print_error(const char *subcommand, const char *format, ...) -{ - va_list args; - va_start(args, format); - vprint_error_core(subcommand, format, args, NULL); - va_end(args); -} - -void print_error_errno(const char *subcommand, const char *format, ...) -{ - int err = errno; - va_list args; - va_start(args, format); - vprint_error_core(subcommand, format, args, strerror(err)); - va_end(args); -} - static void usage(FILE *fp) { /* Please improve the grouping */ @@ -212,12 +183,12 @@ int samtools_main(int argc, char *argv[]) fprintf(pysam_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); return 1; } - else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); +/* AH: else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); */ else if (strcmp(argv[1], "--version") == 0) { fprintf(pysam_stdout, "samtools %s\n" "Using htslib %s\n" -"Copyright (C) 2016 Genome Research Ltd.\n", +"Copyright (C) 2017 Genome Research Ltd.\n", samtools_version(), hts_version()); } else if (strcmp(argv[1], "--version-only") == 0) { diff --git a/samtools/bedcov.c b/samtools/bedcov.c index d4dceee..1113e17 100644 --- a/samtools/bedcov.c +++ b/samtools/bedcov.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "htslib/kstring.h" #include "htslib/sam.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "htslib/kseq.h" @@ -74,7 +75,7 @@ int main_bedcov(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), { NULL, 0, NULL, 0 } }; @@ -89,8 +90,9 @@ int main_bedcov(int argc, char *argv[]) } if (usage || optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov [options] [...]\n\n"); - fprintf(stderr, " -Q INT Only count bases of at least INT quality [0]\n"); - sam_global_opt_help(stderr, "-.--."); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -Q mapping quality threshold [0]\n"); + sam_global_opt_help(stderr, "-.--.-"); return 1; } memset(&str, 0, sizeof(kstring_t)); diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c index 25fdffc..3fd6d4c 100644 --- a/samtools/bedcov.c.pysam.c +++ b/samtools/bedcov.c.pysam.c @@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "htslib/kstring.h" #include "htslib/sam.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "htslib/kseq.h" @@ -76,7 +77,7 @@ int main_bedcov(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), { NULL, 0, NULL, 0 } }; @@ -91,8 +92,9 @@ int main_bedcov(int argc, char *argv[]) } if (usage || optind + 2 > argc) { fprintf(pysam_stderr, "Usage: samtools bedcov [options] [...]\n\n"); - fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n"); - sam_global_opt_help(pysam_stderr, "-.--."); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -Q mapping quality threshold [0]\n"); + sam_global_opt_help(pysam_stderr, "-.--.-"); return 1; } memset(&str, 0, sizeof(kstring_t)); diff --git a/samtools/cut_target.c b/samtools/cut_target.c index 71a6c85..7d541fa 100644 --- a/samtools/cut_target.c +++ b/samtools/cut_target.c @@ -1,7 +1,7 @@ /* cut_target.c -- targetcut subcommand. Copyright (C) 2011 Broad Institute. - Copyright (C) 2012-2013, 2015 Genome Research Ltd. + Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd. Author: Heng Li @@ -28,9 +28,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/hts.h" #include "htslib/sam.h" -#include "errmod.h" #include "htslib/faidx.h" +#include "samtools.h" #include "sam_opts.h" #define ERR_DEP 0.83 @@ -146,7 +147,6 @@ static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) static int read_aln(void *data, bam1_t *b) { - extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag); ct_t *g = (ct_t*)data; int ret; while (1) @@ -160,7 +160,7 @@ static int read_aln(void *data, bam1_t *b) g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len); g->tid = b->core.tid; } - bam_prob_realn_core(b, g->ref, g->len, 1<<1|1); + sam_prob_realn(b, g->ref, g->len, 1<<1|1); } break; } @@ -177,7 +177,7 @@ int main_cut_target(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'), { NULL, 0, NULL, 0 } }; @@ -201,14 +201,19 @@ int main_cut_target(int argc, char *argv[]) } if (usage || argc == optind) { fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); - sam_global_opt_help(stderr, "-.--f"); + sam_global_opt_help(stderr, "-.--f-"); return 1; } l = max_l = 0; cns = 0; g.fp = sam_open_format(argv[optind], "r", &ga.in); + if (g.fp == NULL) { + print_error_errno("targetcut", "can't open \"%s\"", argv[optind]); + return 1; + } + g.h = sam_hdr_read(g.fp); if (g.h == NULL) { - fprintf(stderr, "Couldn't read header for '%s'\n", argv[optind]); + print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]); sam_close(g.fp); return 1; } diff --git a/samtools/cut_target.c.pysam.c b/samtools/cut_target.c.pysam.c index 82a4c4c..e55f749 100644 --- a/samtools/cut_target.c.pysam.c +++ b/samtools/cut_target.c.pysam.c @@ -3,7 +3,7 @@ /* cut_target.c -- targetcut subcommand. Copyright (C) 2011 Broad Institute. - Copyright (C) 2012-2013, 2015 Genome Research Ltd. + Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd. Author: Heng Li @@ -30,9 +30,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/hts.h" #include "htslib/sam.h" -#include "errmod.h" #include "htslib/faidx.h" +#include "samtools.h" #include "sam_opts.h" #define ERR_DEP 0.83 @@ -148,7 +149,6 @@ static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) static int read_aln(void *data, bam1_t *b) { - extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag); ct_t *g = (ct_t*)data; int ret; while (1) @@ -162,7 +162,7 @@ static int read_aln(void *data, bam1_t *b) g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len); g->tid = b->core.tid; } - bam_prob_realn_core(b, g->ref, g->len, 1<<1|1); + sam_prob_realn(b, g->ref, g->len, 1<<1|1); } break; } @@ -179,7 +179,7 @@ int main_cut_target(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'), { NULL, 0, NULL, 0 } }; @@ -203,14 +203,19 @@ int main_cut_target(int argc, char *argv[]) } if (usage || argc == optind) { fprintf(pysam_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); - sam_global_opt_help(pysam_stderr, "-.--f"); + sam_global_opt_help(pysam_stderr, "-.--f-"); return 1; } l = max_l = 0; cns = 0; g.fp = sam_open_format(argv[optind], "r", &ga.in); + if (g.fp == NULL) { + print_error_errno("targetcut", "can't open \"%s\"", argv[optind]); + return 1; + } + g.h = sam_hdr_read(g.fp); if (g.h == NULL) { - fprintf(pysam_stderr, "Couldn't read header for '%s'\n", argv[optind]); + print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]); sam_close(g.fp); return 1; } diff --git a/samtools/errmod.c b/samtools/errmod.c deleted file mode 100644 index c37c6d1..0000000 --- a/samtools/errmod.c +++ /dev/null @@ -1,194 +0,0 @@ -/* errmod.c -- revised MAQ error model. - - Copyright (C) 2010 Broad Institute. - Copyright (C) 2012, 2013 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include "errmod.h" -#include "htslib/ksort.h" -KSORT_INIT_GENERIC(uint16_t) - -/* table of constants generated for given depcorr and eta */ -typedef struct __errmod_coef_t { - double *fk, *beta, *lhet; -} errmod_coef_t; - -typedef struct { - double fsum[16], bsum[16]; - uint32_t c[16]; -} call_aux_t; - -/* \Gamma(n) = (n-1)! */ -#define lfact(n) lgamma(n+1) - -/* generates a success * trials table of bionomial probability densities (log transformed) */ -static double* logbinomial_table( const int n_size ) -{ - /* prob distribution for binom var is p(k) = {n! \over k! (n-k)! } p^k (1-p)^{n-k} */ - /* this calcs p(k) = {log(n!) - log(k!) - log((n-k)!) */ - int k, n; - double *logbinom = (double*)calloc(n_size * n_size, sizeof(double)); - for (n = 1; n < n_size; ++n) { - double lfn = lfact(n); - for (k = 1; k <= n; ++k) - logbinom[n<<8|k] = lfn - lfact(k) - lfact(n-k); - } - return logbinom; -} - -static errmod_coef_t *cal_coef(double depcorr, double eta) -{ - int k, n, q; - long double sum, sum1; - double *lC; - errmod_coef_t *ec; - - ec = calloc(1, sizeof(errmod_coef_t)); - // initialize ->fk - ec->fk = (double*)calloc(256, sizeof(double)); - ec->fk[0] = 1.0; - for (n = 1; n < 256; ++n) - ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta; - // initialize ->coef - ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double)); - - lC = logbinomial_table( 256 ); - - for (q = 1; q < 64; ++q) { - double e = pow(10.0, -q/10.0); - double le = log(e); - double le1 = log(1.0 - e); - for (n = 1; n <= 255; ++n) { - double *beta = ec->beta + (q<<16|n<<8); - sum1 = sum = 0.0; - for (k = n; k >= 0; --k, sum1 = sum) { - sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1); - beta[k] = -10. / M_LN10 * logl(sum1 / sum); - } - } - } - // initialize ->lhet - ec->lhet = (double*)calloc(256 * 256, sizeof(double)); - for (n = 0; n < 256; ++n) - for (k = 0; k < 256; ++k) - ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n; - free(lC); - return ec; -} - -/** - * Create errmod_t object with obj.depcorr set to depcorr and initialise - */ -errmod_t *errmod_init(double depcorr) -{ - errmod_t *em; - em = (errmod_t*)calloc(1, sizeof(errmod_t)); - em->depcorr = depcorr; - em->coef = cal_coef(depcorr, 0.03); - return em; -} - -/** - * Deallocate an errmod_t object - */ -void errmod_destroy(errmod_t *em) -{ - if (em == 0) return; - free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta); - free(em->coef); free(em); -} - -// -// em: error model to fit to data -// m: number of alleles across all samples -// n: number of bases observed in sample -// bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base] -// q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j) -int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) -{ - // Aux - // aux.c is total count of each base observed (ignoring strand) - call_aux_t aux; - // Loop variables - int i, j, k; - // The total count of each base observed per strand - int w[32]; - - memset(q, 0, m * m * sizeof(float)); // initialise q to 0 - if (n == 0) return 0; - // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix - if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255 - ks_shuffle(uint16_t, n, bases); - n = 255; - } - ks_introsort(uint16_t, n, bases); - /* zero out w and aux */ - memset(w, 0, 32 * sizeof(int)); - memset(&aux, 0, sizeof(call_aux_t)); - - for (j = n - 1; j >= 0; --j) { // calculate esum and fsum - uint16_t b = bases[j]; - /* extract quality and cap at 63 */ - int qual = b>>5 < 4? 4 : b>>5; - if (qual > 63) qual = 63; - /* extract base ORed with strand */ - int basestrand = b&0x1f; - /* extract base */ - int base = b&0xf; - aux.fsum[base] += em->coef->fk[w[basestrand]]; - aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]]; - ++aux.c[base]; - ++w[basestrand]; - } - - // generate likelihood - for (j = 0; j < m; ++j) { - float tmp1, tmp3; - int tmp2; - // homozygous - for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) { - if (k == j) continue; - tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; - } - if (tmp2) { - q[j*m+j] = tmp1; - } - // heterozygous - for (k = j + 1; k < m; ++k) { - int cjk = aux.c[j] + aux.c[k]; - for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { - if (i == j || i == k) continue; - tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; - } - if (tmp2) { - q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; - } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k - } - /* clamp to greater than 0 */ - for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; - } - - return 0; -} diff --git a/samtools/errmod.c.pysam.c b/samtools/errmod.c.pysam.c deleted file mode 100644 index 12176cf..0000000 --- a/samtools/errmod.c.pysam.c +++ /dev/null @@ -1,196 +0,0 @@ -#include "pysam.h" - -/* errmod.c -- revised MAQ error model. - - Copyright (C) 2010 Broad Institute. - Copyright (C) 2012, 2013 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include "errmod.h" -#include "htslib/ksort.h" -KSORT_INIT_GENERIC(uint16_t) - -/* table of constants generated for given depcorr and eta */ -typedef struct __errmod_coef_t { - double *fk, *beta, *lhet; -} errmod_coef_t; - -typedef struct { - double fsum[16], bsum[16]; - uint32_t c[16]; -} call_aux_t; - -/* \Gamma(n) = (n-1)! */ -#define lfact(n) lgamma(n+1) - -/* generates a success * trials table of bionomial probability densities (log transformed) */ -static double* logbinomial_table( const int n_size ) -{ - /* prob distribution for binom var is p(k) = {n! \over k! (n-k)! } p^k (1-p)^{n-k} */ - /* this calcs p(k) = {log(n!) - log(k!) - log((n-k)!) */ - int k, n; - double *logbinom = (double*)calloc(n_size * n_size, sizeof(double)); - for (n = 1; n < n_size; ++n) { - double lfn = lfact(n); - for (k = 1; k <= n; ++k) - logbinom[n<<8|k] = lfn - lfact(k) - lfact(n-k); - } - return logbinom; -} - -static errmod_coef_t *cal_coef(double depcorr, double eta) -{ - int k, n, q; - long double sum, sum1; - double *lC; - errmod_coef_t *ec; - - ec = calloc(1, sizeof(errmod_coef_t)); - // initialize ->fk - ec->fk = (double*)calloc(256, sizeof(double)); - ec->fk[0] = 1.0; - for (n = 1; n < 256; ++n) - ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta; - // initialize ->coef - ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double)); - - lC = logbinomial_table( 256 ); - - for (q = 1; q < 64; ++q) { - double e = pow(10.0, -q/10.0); - double le = log(e); - double le1 = log(1.0 - e); - for (n = 1; n <= 255; ++n) { - double *beta = ec->beta + (q<<16|n<<8); - sum1 = sum = 0.0; - for (k = n; k >= 0; --k, sum1 = sum) { - sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1); - beta[k] = -10. / M_LN10 * logl(sum1 / sum); - } - } - } - // initialize ->lhet - ec->lhet = (double*)calloc(256 * 256, sizeof(double)); - for (n = 0; n < 256; ++n) - for (k = 0; k < 256; ++k) - ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n; - free(lC); - return ec; -} - -/** - * Create errmod_t object with obj.depcorr set to depcorr and initialise - */ -errmod_t *errmod_init(double depcorr) -{ - errmod_t *em; - em = (errmod_t*)calloc(1, sizeof(errmod_t)); - em->depcorr = depcorr; - em->coef = cal_coef(depcorr, 0.03); - return em; -} - -/** - * Deallocate an errmod_t object - */ -void errmod_destroy(errmod_t *em) -{ - if (em == 0) return; - free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta); - free(em->coef); free(em); -} - -// -// em: error model to fit to data -// m: number of alleles across all samples -// n: number of bases observed in sample -// bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base] -// q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j) -int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) -{ - // Aux - // aux.c is total count of each base observed (ignoring strand) - call_aux_t aux; - // Loop variables - int i, j, k; - // The total count of each base observed per strand - int w[32]; - - memset(q, 0, m * m * sizeof(float)); // initialise q to 0 - if (n == 0) return 0; - // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix - if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255 - ks_shuffle(uint16_t, n, bases); - n = 255; - } - ks_introsort(uint16_t, n, bases); - /* zero out w and aux */ - memset(w, 0, 32 * sizeof(int)); - memset(&aux, 0, sizeof(call_aux_t)); - - for (j = n - 1; j >= 0; --j) { // calculate esum and fsum - uint16_t b = bases[j]; - /* extract quality and cap at 63 */ - int qual = b>>5 < 4? 4 : b>>5; - if (qual > 63) qual = 63; - /* extract base ORed with strand */ - int basestrand = b&0x1f; - /* extract base */ - int base = b&0xf; - aux.fsum[base] += em->coef->fk[w[basestrand]]; - aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]]; - ++aux.c[base]; - ++w[basestrand]; - } - - // generate likelihood - for (j = 0; j < m; ++j) { - float tmp1, tmp3; - int tmp2; - // homozygous - for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) { - if (k == j) continue; - tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; - } - if (tmp2) { - q[j*m+j] = tmp1; - } - // heterozygous - for (k = j + 1; k < m; ++k) { - int cjk = aux.c[j] + aux.c[k]; - for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { - if (i == j || i == k) continue; - tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; - } - if (tmp2) { - q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; - } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k - } - /* clamp to greater than 0 */ - for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; - } - - return 0; -} diff --git a/samtools/errmod.h b/samtools/errmod.h deleted file mode 100644 index 6db46f4..0000000 --- a/samtools/errmod.h +++ /dev/null @@ -1,49 +0,0 @@ -/* errmod.h -- revised MAQ error model. - - Copyright (C) 2010 Broad Institute. - Copyright (C) 2012 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#ifndef ERRMOD_H -#define ERRMOD_H - -#include - -struct __errmod_coef_t; - -typedef struct { - double depcorr; - struct __errmod_coef_t *coef; -} errmod_t; - -errmod_t *errmod_init(double depcorr); -void errmod_destroy(errmod_t *em); - -/* - n: number of bases - m: maximum base - bases[i]: qual:6, strand:1, base:4 - q[i*m+j]: phred-scaled likelihood of (i,j) - */ -int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q); - -#endif diff --git a/samtools/faidx.c b/samtools/faidx.c index 336bde5..c5c9ed6 100644 --- a/samtools/faidx.c +++ b/samtools/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- faidx subcommand. - Copyright (C) 2008, 2009, 2013 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2016 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -25,34 +25,19 @@ DEALINGS IN THE SOFTWARE. */ #include -#include -#include #include #include -#include #include -#include + #include +#include "samtools.h" -static void error(const char *format, ...) +static int usage(FILE *fp, int exit_status) { - if ( format ) - { - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); - } - else - { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools faidx [ [...]]\n"); - fprintf(stderr, "\n"); - } - exit(-1); + fprintf(fp, "Usage: samtools faidx [ [...]]\n"); + return exit_status; } - int faidx_main(int argc, char *argv[]) { int c; @@ -61,39 +46,60 @@ int faidx_main(int argc, char *argv[]) switch(c) { case 'h': + return usage(stdout, EXIT_SUCCESS); + default: - error(NULL); + return usage(stderr, EXIT_FAILURE); } } if ( argc==optind ) - error(NULL); + return usage(stdout, EXIT_SUCCESS); if ( argc==2 ) { if (fai_build(argv[optind]) != 0) { - error("Could not build fai index %s.fai\n", argv[optind]); + fprintf(stderr, "Could not build fai index %s.fai\n", argv[optind]); + return EXIT_FAILURE; } return 0; } faidx_t *fai = fai_load(argv[optind]); - if ( !fai ) error("Could not load fai index of %s\n", argv[optind]); + if ( !fai ) { + fprintf(stderr, "Could not load fai index of %s\n", argv[optind]); + return EXIT_FAILURE; + } + + int exit_status = EXIT_SUCCESS; - while ( ++optind%s\n", argv[optind]); - int i, j, seq_len; + int seq_len; char *seq = fai_fetch(fai, argv[optind], &seq_len); - if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]); - for (i=0; i @@ -27,34 +27,19 @@ DEALINGS IN THE SOFTWARE. */ #include -#include -#include #include #include -#include #include -#include + #include +#include "samtools.h" -static void error(const char *format, ...) +static int usage(FILE *fp, int exit_status) { - if ( format ) - { - va_list ap; - va_start(ap, format); - vfprintf(pysam_stderr, format, ap); - va_end(ap); - } - else - { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Usage: samtools faidx [ [...]]\n"); - fprintf(pysam_stderr, "\n"); - } - exit(-1); + fprintf(fp, "Usage: samtools faidx [ [...]]\n"); + return exit_status; } - int faidx_main(int argc, char *argv[]) { int c; @@ -63,39 +48,60 @@ int faidx_main(int argc, char *argv[]) switch(c) { case 'h': + return usage(pysam_stdout, EXIT_SUCCESS); + default: - error(NULL); + return usage(pysam_stderr, EXIT_FAILURE); } } if ( argc==optind ) - error(NULL); + return usage(pysam_stdout, EXIT_SUCCESS); if ( argc==2 ) { if (fai_build(argv[optind]) != 0) { - error("Could not build fai index %s.fai\n", argv[optind]); + fprintf(pysam_stderr, "Could not build fai index %s.fai\n", argv[optind]); + return EXIT_FAILURE; } return 0; } faidx_t *fai = fai_load(argv[optind]); - if ( !fai ) error("Could not load fai index of %s\n", argv[optind]); + if ( !fai ) { + fprintf(pysam_stderr, "Could not load fai index of %s\n", argv[optind]); + return EXIT_FAILURE; + } + + int exit_status = EXIT_SUCCESS; - while ( ++optind%s\n", argv[optind]); - int i, j, seq_len; + int seq_len; char *seq = fai_fetch(fai, argv[optind], &seq_len); - if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]); - for (i=0; i - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include - -#include -#include -#include -#include -#include -#include "kprobaln.h" - -/***************************************** - * Probabilistic banded glocal alignment * - *****************************************/ - -#define EI .25 -#define EM .33333333333 - -static float g_qual2prob[256]; - -#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; } - -kpa_par_t kpa_par_def = { 0.001, 0.1, 10 }; -kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 }; - -/* - The topology of the profile HMM: - - /\ /\ /\ /\ - I[1] I[k-1] I[k] I[L] - ^ \ \ ^ \ ^ \ \ ^ - | \ \ | \ | \ \ | - M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1] - \ \/ \/ \/ / - \ /\ /\ /\ / - -> D[k-1] -> D[k] -> - - M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1]. - - On input, _ref is the reference sequence and _query is the query - sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an - ambiguous residue. iqual is the base quality. c sets the gap open - probability, gap extension probability and band width. - - On output, state and q are arrays of length l_query. The higher 30 - bits give the reference position the query base is matched to and the - lower two bits can be 0 (an alignment match) or 1 (an - insertion). q[i] gives the phred scaled posterior probability of - state[i] being wrong. - */ -int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual, - const kpa_par_t *c, int *state, uint8_t *q) -{ - double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb; - float *qual, *_qual; - const uint8_t *ref, *query; - int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr; - - if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault - - /*** initialization ***/ - is_backward = state && q? 1 : 0; - ref = _ref - 1; query = _query - 1; // change to 1-based coordinate - bw = l_ref > l_query? l_ref : l_query; - if (bw > c->bw) bw = c->bw; - if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query); - bw2 = bw * 2 + 1; - // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] - f = calloc(l_query+1, sizeof(double*)); - if (is_backward) b = calloc(l_query+1, sizeof(double*)); - for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0 - f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs - if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double)); - } - s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow - // initialize qual - _qual = calloc(l_query, sizeof(float)); - if (g_qual2prob[0] == 0) - for (i = 0; i < 256; ++i) - g_qual2prob[i] = pow(10, -i/10.); - for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30]; - qual = _qual - 1; - // initialize transition probability - sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof - m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM); - m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.; - m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e; - bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1 - /*** forward ***/ - // f[0] - set_u(k, bw, 0, 0); - f[0][k] = s[0] = 1.; - { // f[1] - double *fi = f[1], sum; - int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end; - for (k = beg, sum = 0.; k <= end; ++k) { - int u; - double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; - set_u(u, bw, 1, k); - fi[u+0] = e * bM; fi[u+1] = EI * bI; - sum += fi[u] + fi[u+1]; - } - // rescale - s[1] = sum; - set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2; - for (k = _beg; k <= _end; ++k) fi[k] /= sum; - } - // f[2..l_query] - for (i = 2; i <= l_query; ++i) { - double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i]; - int beg = 1, end = l_ref, x, _beg, _end; - uint8_t qyi = query[i]; - x = i - bw; beg = beg > x? beg : x; // band start - x = i + bw; end = end < x? end : x; // band end - for (k = beg, sum = 0.; k <= end; ++k) { - int u, v11, v01, v10; - double e; - e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM; - set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1); - fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]); - fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]); - fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; - sum += fi[u] + fi[u+1] + fi[u+2]; -// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG - } - // rescale - s[i] = sum; - set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; - for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum; - } - { // f[l_query+1] - double sum; - for (k = 1, sum = 0.; k <= l_ref; ++k) { - int u; - set_u(u, bw, l_query, k); - if (u < 3 || u >= bw2*3+3) continue; - sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI; - } - s[l_query+1] = sum; // the last scaling factor - } - { // compute likelihood - double p = 1., Pr1 = 0.; - for (i = 0; i <= l_query + 1; ++i) { - p *= s[i]; - if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; - } - Pr1 += -4.343 * log(p * l_ref * l_query); - Pr = (int)(Pr1 + .499); - if (!is_backward) { // skip backward and MAP - for (i = 0; i <= l_query; ++i) free(f[i]); - free(f); free(s); free(_qual); - return Pr; - } - } - /*** backward ***/ - // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from) - for (k = 1; k <= l_ref; ++k) { - int u; - double *bi = b[l_query]; - set_u(u, bw, l_query, k); - if (u < 3 || u >= bw2*3+3) continue; - bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; - } - // b[l_query-1..1] - for (i = l_query - 1; i >= 1; --i) { - int beg = 1, end = l_ref, x, _beg, _end; - double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1]; - uint8_t qyi1 = query[i+1]; - x = i - bw; beg = beg > x? beg : x; - x = i + bw; end = end < x? end : x; - for (k = end; k >= beg; --k) { - int u, v11, v01, v10; - double e; - set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1); - e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11]; - bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. - bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; - bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y; -// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG - } - // rescale - set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; - for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; - } - { // b[0] - int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; - double sum = 0.; - for (k = end; k >= beg; --k) { - int u; - double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; - set_u(u, bw, 1, k); - if (u < 3 || u >= bw2*3+3) continue; - sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI; - } - set_u(k, bw, 0, 0); - pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0 - } - is_diff = fabs(pb - 1.) > 1e-7? 1 : 0; - /*** MAP ***/ - for (i = 1; i <= l_query; ++i) { - double sum = 0., *fi = f[i], *bi = b[i], max = 0.; - int beg = 1, end = l_ref, x, max_k = -1; - x = i - bw; beg = beg > x? beg : x; - x = i + bw; end = end < x? end : x; - for (k = beg; k <= end; ++k) { - int u; - double z; - set_u(u, bw, i, k); - z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; - z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; - } - max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 - if (state) state[i-1] = max_k; - if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; -#ifdef _MAIN - fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2, - "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG -#endif - } - /*** free ***/ - for (i = 0; i <= l_query; ++i) { - free(f[i]); free(b[i]); - } - free(f); free(b); free(s); free(_qual); - return Pr; -} - -#ifdef _MAIN -#include -int main(int argc, char *argv[]) -{ - uint8_t conv[256], *iqual, *ref, *query; - int c, l_ref, l_query, i, q = 30, b = 10, P; - while ((c = getopt(argc, argv, "b:q:")) >= 0) { - switch (c) { - case 'b': b = atoi(optarg); break; - case 'q': q = atoi(optarg); break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: %s [-q %d] [-b %d] \n", argv[0], q, b); // example: acttc attc - return 1; - } - memset(conv, 4, 256); - conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1; - conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3; - ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1]; - l_ref = strlen((char*)ref); l_query = strlen((char*)query); - for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]]; - for (i = 0; i < l_query; ++i) query[i] = conv[query[i]]; - iqual = malloc(l_query); - memset(iqual, q, l_query); - kpa_par_def.bw = b; - P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0); - fprintf(stderr, "%d\n", P); - free(iqual); - return 0; -} -#endif diff --git a/samtools/kprobaln.c.pysam.c b/samtools/kprobaln.c.pysam.c deleted file mode 100644 index 630b730..0000000 --- a/samtools/kprobaln.c.pysam.c +++ /dev/null @@ -1,284 +0,0 @@ -#include "pysam.h" - -/* The MIT License - - Copyright (C) 2003-2006, 2008-2010 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include - -#include -#include -#include -#include -#include -#include "kprobaln.h" - -/***************************************** - * Probabilistic banded glocal alignment * - *****************************************/ - -#define EI .25 -#define EM .33333333333 - -static float g_qual2prob[256]; - -#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; } - -kpa_par_t kpa_par_def = { 0.001, 0.1, 10 }; -kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 }; - -/* - The topology of the profile HMM: - - /\ /\ /\ /\ - I[1] I[k-1] I[k] I[L] - ^ \ \ ^ \ ^ \ \ ^ - | \ \ | \ | \ \ | - M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1] - \ \/ \/ \/ / - \ /\ /\ /\ / - -> D[k-1] -> D[k] -> - - M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1]. - - On input, _ref is the reference sequence and _query is the query - sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an - ambiguous residue. iqual is the base quality. c sets the gap open - probability, gap extension probability and band width. - - On output, state and q are arrays of length l_query. The higher 30 - bits give the reference position the query base is matched to and the - lower two bits can be 0 (an alignment match) or 1 (an - insertion). q[i] gives the phred scaled posterior probability of - state[i] being wrong. - */ -int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual, - const kpa_par_t *c, int *state, uint8_t *q) -{ - double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb; - float *qual, *_qual; - const uint8_t *ref, *query; - int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr; - - if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault - - /*** initialization ***/ - is_backward = state && q? 1 : 0; - ref = _ref - 1; query = _query - 1; // change to 1-based coordinate - bw = l_ref > l_query? l_ref : l_query; - if (bw > c->bw) bw = c->bw; - if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query); - bw2 = bw * 2 + 1; - // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] - f = calloc(l_query+1, sizeof(double*)); - if (is_backward) b = calloc(l_query+1, sizeof(double*)); - for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0 - f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs - if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double)); - } - s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow - // initialize qual - _qual = calloc(l_query, sizeof(float)); - if (g_qual2prob[0] == 0) - for (i = 0; i < 256; ++i) - g_qual2prob[i] = pow(10, -i/10.); - for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30]; - qual = _qual - 1; - // initialize transition probability - sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof - m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM); - m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.; - m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e; - bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1 - /*** forward ***/ - // f[0] - set_u(k, bw, 0, 0); - f[0][k] = s[0] = 1.; - { // f[1] - double *fi = f[1], sum; - int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end; - for (k = beg, sum = 0.; k <= end; ++k) { - int u; - double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; - set_u(u, bw, 1, k); - fi[u+0] = e * bM; fi[u+1] = EI * bI; - sum += fi[u] + fi[u+1]; - } - // rescale - s[1] = sum; - set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2; - for (k = _beg; k <= _end; ++k) fi[k] /= sum; - } - // f[2..l_query] - for (i = 2; i <= l_query; ++i) { - double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i]; - int beg = 1, end = l_ref, x, _beg, _end; - uint8_t qyi = query[i]; - x = i - bw; beg = beg > x? beg : x; // band start - x = i + bw; end = end < x? end : x; // band end - for (k = beg, sum = 0.; k <= end; ++k) { - int u, v11, v01, v10; - double e; - e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM; - set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1); - fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]); - fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]); - fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; - sum += fi[u] + fi[u+1] + fi[u+2]; -// fprintf(pysam_stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG - } - // rescale - s[i] = sum; - set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; - for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum; - } - { // f[l_query+1] - double sum; - for (k = 1, sum = 0.; k <= l_ref; ++k) { - int u; - set_u(u, bw, l_query, k); - if (u < 3 || u >= bw2*3+3) continue; - sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI; - } - s[l_query+1] = sum; // the last scaling factor - } - { // compute likelihood - double p = 1., Pr1 = 0.; - for (i = 0; i <= l_query + 1; ++i) { - p *= s[i]; - if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; - } - Pr1 += -4.343 * log(p * l_ref * l_query); - Pr = (int)(Pr1 + .499); - if (!is_backward) { // skip backward and MAP - for (i = 0; i <= l_query; ++i) free(f[i]); - free(f); free(s); free(_qual); - return Pr; - } - } - /*** backward ***/ - // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from) - for (k = 1; k <= l_ref; ++k) { - int u; - double *bi = b[l_query]; - set_u(u, bw, l_query, k); - if (u < 3 || u >= bw2*3+3) continue; - bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; - } - // b[l_query-1..1] - for (i = l_query - 1; i >= 1; --i) { - int beg = 1, end = l_ref, x, _beg, _end; - double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1]; - uint8_t qyi1 = query[i+1]; - x = i - bw; beg = beg > x? beg : x; - x = i + bw; end = end < x? end : x; - for (k = end; k >= beg; --k) { - int u, v11, v01, v10; - double e; - set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1); - e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11]; - bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. - bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; - bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y; -// fprintf(pysam_stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG - } - // rescale - set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; - for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; - } - { // b[0] - int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; - double sum = 0.; - for (k = end; k >= beg; --k) { - int u; - double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; - set_u(u, bw, 1, k); - if (u < 3 || u >= bw2*3+3) continue; - sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI; - } - set_u(k, bw, 0, 0); - pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0 - } - is_diff = fabs(pb - 1.) > 1e-7? 1 : 0; - /*** MAP ***/ - for (i = 1; i <= l_query; ++i) { - double sum = 0., *fi = f[i], *bi = b[i], max = 0.; - int beg = 1, end = l_ref, x, max_k = -1; - x = i - bw; beg = beg > x? beg : x; - x = i + bw; end = end < x? end : x; - for (k = beg; k <= end; ++k) { - int u; - double z; - set_u(u, bw, i, k); - z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; - z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; - } - max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 - if (state) state[i-1] = max_k; - if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; -#ifdef _MAIN - fprintf(pysam_stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2, - "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG -#endif - } - /*** free ***/ - for (i = 0; i <= l_query; ++i) { - free(f[i]); free(b[i]); - } - free(f); free(b); free(s); free(_qual); - return Pr; -} - -#ifdef _MAIN -#include -int samtools_kprobaln_main(int argc, char *argv[]) -{ - uint8_t conv[256], *iqual, *ref, *query; - int c, l_ref, l_query, i, q = 30, b = 10, P; - while ((c = getopt(argc, argv, "b:q:")) >= 0) { - switch (c) { - case 'b': b = atoi(optarg); break; - case 'q': q = atoi(optarg); break; - } - } - if (optind + 2 > argc) { - fprintf(pysam_stderr, "Usage: %s [-q %d] [-b %d] \n", argv[0], q, b); // example: acttc attc - return 1; - } - memset(conv, 4, 256); - conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1; - conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3; - ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1]; - l_ref = strlen((char*)ref); l_query = strlen((char*)query); - for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]]; - for (i = 0; i < l_query; ++i) query[i] = conv[query[i]]; - iqual = malloc(l_query); - memset(iqual, q, l_query); - kpa_par_def.bw = b; - P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0); - fprintf(pysam_stderr, "%d\n", P); - free(iqual); - return 0; -} -#endif diff --git a/samtools/kprobaln.h b/samtools/kprobaln.h deleted file mode 100644 index 50ae77b..0000000 --- a/samtools/kprobaln.h +++ /dev/null @@ -1,49 +0,0 @@ -/* The MIT License - - Copyright (C) 2003-2006, 2008-2010 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef LH3_KPROBALN_H_ -#define LH3_KPROBALN_H_ - -#include - -typedef struct { - float d, e; - int bw; -} kpa_par_t; - -#ifdef __cplusplus -extern "C" { -#endif - - int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual, - const kpa_par_t *c, int *state, uint8_t *q); - -#ifdef __cplusplus -} -#endif - -extern kpa_par_t kpa_par_def, kpa_par_alt; - -#endif diff --git a/samtools/misc/ace2sam.c b/samtools/misc/ace2sam.c index 77b9993..19727eb 100644 --- a/samtools/misc/ace2sam.c +++ b/samtools/misc/ace2sam.c @@ -161,7 +161,10 @@ int main(int argc, char *argv[]) } if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); ks_getuntil(ks, '\n', &s, &dret); // skip the empty line - if (write_cns) puts(t[4].s); t[4].l = 0; + if (write_cns) { + if (t[4].l) puts(t[4].s); + t[4].l = 0; + } } else if (strcmp(s.s, "AF") == 0) { // padded read position int reversed, neg, pos; if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'"); diff --git a/samtools/misc/ace2sam.c.pysam.c b/samtools/misc/ace2sam.c.pysam.c index a663399..02d2f58 100644 --- a/samtools/misc/ace2sam.c.pysam.c +++ b/samtools/misc/ace2sam.c.pysam.c @@ -163,7 +163,10 @@ int samtools_ace2sam_main(int argc, char *argv[]) } if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); ks_getuntil(ks, '\n', &s, &dret); // skip the empty line - if (write_cns) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); t[4].l = 0; + if (write_cns) { + if (t[4].l) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); + t[4].l = 0; + } } else if (strcmp(s.s, "AF") == 0) { // padded read position int reversed, neg, pos; if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'"); diff --git a/samtools/padding.c b/samtools/padding.c index cea79cf..2f10e86 100644 --- a/samtools/padding.c +++ b/samtools/padding.c @@ -491,7 +491,7 @@ int main_pad2unpad(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'), { NULL, 0, NULL, 0 } }; @@ -603,7 +603,7 @@ static int usage(int is_long_help) fprintf(stderr, " Padded reference sequence file [null]\n"); fprintf(stderr, " -o FILE Output file name [stdout]\n"); fprintf(stderr, " -? Longer help\n"); - sam_global_opt_help(stderr, "-...-"); + sam_global_opt_help(stderr, "-...--"); if (is_long_help) fprintf(stderr, diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c index 9f85c95..a3461e4 100644 --- a/samtools/padding.c.pysam.c +++ b/samtools/padding.c.pysam.c @@ -493,7 +493,7 @@ int main_pad2unpad(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'), { NULL, 0, NULL, 0 } }; @@ -605,7 +605,7 @@ static int usage(int is_long_help) fprintf(pysam_stderr, " Padded reference sequence file [null]\n"); fprintf(pysam_stderr, " -o FILE Output file name [pysam_stdout]\n"); fprintf(pysam_stderr, " -? Longer help\n"); - sam_global_opt_help(pysam_stderr, "-...-"); + sam_global_opt_help(pysam_stderr, "-...--"); if (is_long_help) fprintf(pysam_stderr, diff --git a/samtools/phase.c b/samtools/phase.c index 6909912..584334d 100644 --- a/samtools/phase.c +++ b/samtools/phase.c @@ -31,9 +31,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/hts.h" #include "htslib/sam.h" #include "htslib/kstring.h" -#include "errmod.h" #include "sam_opts.h" #include "samtools.h" @@ -580,7 +580,7 @@ int main_phase(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), { NULL, 0, NULL, 0 } }; @@ -620,7 +620,7 @@ int main_phase(int argc, char *argv[]) // fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); fprintf(stderr, "\n"); - sam_global_opt_help(stderr, "-...."); + sam_global_opt_help(stderr, "-....-"); return 1; } diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c index 3babd37..4226c03 100644 --- a/samtools/phase.c.pysam.c +++ b/samtools/phase.c.pysam.c @@ -33,9 +33,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/hts.h" #include "htslib/sam.h" #include "htslib/kstring.h" -#include "errmod.h" #include "sam_opts.h" #include "samtools.h" @@ -582,7 +582,7 @@ int main_phase(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), { NULL, 0, NULL, 0 } }; @@ -622,7 +622,7 @@ int main_phase(int argc, char *argv[]) // fprintf(pysam_stderr, " -e do not discover SNPs (effective with -l)\n"); fprintf(pysam_stderr, "\n"); - sam_global_opt_help(pysam_stderr, "-...."); + sam_global_opt_help(pysam_stderr, "-....-"); return 1; } diff --git a/samtools/sam.h b/samtools/sam.h index 5130105..6545e64 100644 --- a/samtools/sam.h +++ b/samtools/sam.h @@ -50,7 +50,7 @@ typedef struct { samFile *file; struct { BGZF *bam; } x; // Hack so that fp->x.bam still works bam_hdr_t *header; - short is_write:1; + unsigned short is_write:1; } samfile_t; #ifdef __cplusplus diff --git a/samtools/sam_opts.c b/samtools/sam_opts.c index 9369145..9e7a8de 100644 --- a/samtools/sam_opts.c +++ b/samtools/sam_opts.c @@ -72,6 +72,9 @@ int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt, r |= hts_opt_add((hts_opt **)&ga->out.specific, ref); free(ref); break; + } else if (strcmp(lopt->name, "threads") == 0) { + ga->nthreads = atoi(optarg); + break; // } else if (strcmp(lopt->name, "verbose") == 0) { // ga->verbosity++; // break; @@ -100,7 +103,7 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) { int i = 0; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0), + SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0,0), { NULL, 0, NULL, 0 } }; @@ -130,6 +133,9 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) { else if (strcmp(lopts[i].name, "reference") == 0) fprintf(fp,"reference FILE\n" " Reference sequence FASTA FILE [null]\n"); + else if (strcmp(lopts[i].name, "threads") == 0) + fprintf(fp,"threads INT\n" + " Number of additional threads to use [0]\n"); // else if (strcmp(lopts[i].name, "verbose") == 0) // fprintf(fp,"verbose\n" // " Increment level of verbosity\n"); diff --git a/samtools/sam_opts.c.pysam.c b/samtools/sam_opts.c.pysam.c index d0b56a3..aed4869 100644 --- a/samtools/sam_opts.c.pysam.c +++ b/samtools/sam_opts.c.pysam.c @@ -74,6 +74,9 @@ int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt, r |= hts_opt_add((hts_opt **)&ga->out.specific, ref); free(ref); break; + } else if (strcmp(lopt->name, "threads") == 0) { + ga->nthreads = atoi(optarg); + break; // } else if (strcmp(lopt->name, "verbose") == 0) { // ga->verbosity++; // break; @@ -102,7 +105,7 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) { int i = 0; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0), + SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0,0), { NULL, 0, NULL, 0 } }; @@ -132,6 +135,9 @@ void sam_global_opt_help(FILE *fp, const char *shortopts) { else if (strcmp(lopts[i].name, "reference") == 0) fprintf(fp,"reference FILE\n" " Reference sequence FASTA FILE [null]\n"); + else if (strcmp(lopts[i].name, "threads") == 0) + fprintf(fp,"threads INT\n" + " Number of additional threads to use [0]\n"); // else if (strcmp(lopts[i].name, "verbose") == 0) // fprintf(fp,"verbose\n" // " Increment level of verbosity\n"); diff --git a/samtools/sam_opts.h b/samtools/sam_opts.h index 25e9279..6edbf64 100644 --- a/samtools/sam_opts.h +++ b/samtools/sam_opts.h @@ -34,6 +34,7 @@ typedef struct sam_global_args { htsFormat in; htsFormat out; char *reference; + int nthreads; //int verbosity; } sam_global_args; @@ -45,6 +46,7 @@ enum { SAM_OPT_OUTPUT_FMT, SAM_OPT_OUTPUT_FMT_OPTION, SAM_OPT_REFERENCE, + SAM_OPT_NTHREADS, //SAM_OPT_VERBOSE }; @@ -56,12 +58,13 @@ enum { // 0 No short option has been assigned. Use --long-opt only. // '-' Both long and short options are disabled. // Otherwise the equivalent short option is character . -#define SAM_OPT_GLOBAL_OPTIONS(o1, o2, o3, o4, o5) \ +#define SAM_OPT_GLOBAL_OPTIONS(o1, o2, o3, o4, o5, o6) \ {"input-fmt", required_argument, NULL, SAM_OPT_VAL(o1, SAM_OPT_INPUT_FMT)}, \ {"input-fmt-option", required_argument, NULL, SAM_OPT_VAL(o2, SAM_OPT_INPUT_FMT_OPTION)}, \ {"output-fmt", required_argument, NULL, SAM_OPT_VAL(o3, SAM_OPT_OUTPUT_FMT)}, \ {"output-fmt-option", required_argument, NULL, SAM_OPT_VAL(o4, SAM_OPT_OUTPUT_FMT_OPTION)}, \ - {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)} + {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}, \ + {"threads", required_argument, NULL, SAM_OPT_VAL(o6, SAM_OPT_NTHREADS)} //{"verbose", no_argument, NULL, SAM_OPT_VERBOSE} /* diff --git a/samtools/sam_utils.c b/samtools/sam_utils.c new file mode 100644 index 0000000..4f8964a --- /dev/null +++ b/samtools/sam_utils.c @@ -0,0 +1,60 @@ +/* sam_utils.c -- various utilities internal to samtools. + + Copyright (C) 2014-2016 Genome Research Ltd. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include +#include + +#include "samtools.h" + +static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) +{ + fflush(stdout); + if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand); + else fprintf(stderr, "samtools: "); + vfprintf(stderr, format, args); + if (extra) fprintf(stderr, ": %s\n", extra); + else fprintf(stderr, "\n"); + fflush(stderr); +} + +void print_error(const char *subcommand, const char *format, ...) +{ + va_list args; + va_start(args, format); + vprint_error_core(subcommand, format, args, NULL); + va_end(args); +} + +void print_error_errno(const char *subcommand, const char *format, ...) +{ + int err = errno; + va_list args; + va_start(args, format); + vprint_error_core(subcommand, format, args, err? strerror(err) : NULL); + va_end(args); +} diff --git a/samtools/sam_utils.c.pysam.c b/samtools/sam_utils.c.pysam.c new file mode 100644 index 0000000..0a78619 --- /dev/null +++ b/samtools/sam_utils.c.pysam.c @@ -0,0 +1,62 @@ +#include "pysam.h" + +/* sam_utils.c -- various utilities internal to samtools. + + Copyright (C) 2014-2016 Genome Research Ltd. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include +#include + +#include "samtools.h" + +static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) +{ + fflush(pysam_stdout); + if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand); + else fprintf(pysam_stderr, "samtools: "); + vfprintf(pysam_stderr, format, args); + if (extra) fprintf(pysam_stderr, ": %s\n", extra); + else fprintf(pysam_stderr, "\n"); + fflush(pysam_stderr); +} + +void print_error(const char *subcommand, const char *format, ...) +{ + va_list args; + va_start(args, format); + vprint_error_core(subcommand, format, args, NULL); + va_end(args); +} + +void print_error_errno(const char *subcommand, const char *format, ...) +{ + int err = errno; + va_list args; + va_start(args, format); + vprint_error_core(subcommand, format, args, err? strerror(err) : NULL); + va_end(args); +} diff --git a/samtools/sam_view.c b/samtools/sam_view.c index 402e1d3..9c2d15b 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -1,6 +1,6 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2015 Genome Research Ltd. + Copyright (C) 2009-2017 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include #include #include @@ -34,12 +35,18 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/sam.h" #include "htslib/faidx.h" #include "htslib/kstring.h" #include "htslib/khash.h" +#include "htslib/thread_pool.h" #include "samtools.h" #include "sam_opts.h" + +#define DEFAULT_BARCODE_TAG "BC" +#define DEFAULT_QUALITY_TAG "QT" + KHASH_SET_INIT_STR(rg) typedef khash_t(rg) *rghash_t; @@ -50,6 +57,7 @@ typedef struct samview_settings { int min_mapQ; int flag_on; int flag_off; + int flag_alloff; int min_qlen; int remove_B; uint32_t subsam_seed; @@ -83,6 +91,8 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin } if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off)) return 1; + if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) + return 1; if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { @@ -231,19 +241,22 @@ static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; - int is_long_help = 0, n_threads = 0; + int is_long_help = 0; int64_t count = 0; samFile *in = 0, *out = 0, *un_out=0; + FILE *fp_out = NULL; bam_hdr_t *header = NULL; char out_mode[5], out_un_mode[5], *out_format = ""; char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool p = {NULL, 0}; samview_settings_t settings = { .rghash = NULL, .min_mapQ = 0, .flag_on = 0, .flag_off = 0, + .flag_alloff = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, @@ -253,8 +266,7 @@ int main_samview(int argc, char *argv[]) }; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'), - { "threads", required_argument, NULL, '@' }, + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), { NULL, 0, NULL, 0 } }; @@ -262,11 +274,13 @@ int main_samview(int argc, char *argv[]) strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:", lopts, NULL)) >= 0) { switch (c) { case 's': if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { + // Convert likely user input 0,1,2,... to pseudo-random + // values with more entropy and more bits set srand(settings.subsam_seed); settings.subsam_seed = rand(); } @@ -284,6 +298,7 @@ int main_samview(int argc, char *argv[]) case 'U': fn_un_out = strdup(optarg); break; case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; + case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; @@ -313,7 +328,6 @@ int main_samview(int argc, char *argv[]) */ case '?': is_long_help = 1; break; case 'B': settings.remove_B = 1; break; - case '@': n_threads = strtol(optarg, 0, 0); break; case 'x': { if (strlen(optarg) != 2) { @@ -425,8 +439,26 @@ int main_samview(int argc, char *argv[]) } } } + else { + if (fn_out) { + fp_out = fopen(fn_out, "w"); + if (fp_out == NULL) { + print_error_errno("view", "can't create \"%s\"", fn_out); + ret = EXIT_FAILURE; + goto view_end; + } + } + } - if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } + if (ga.nthreads > 1) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(stderr, "Error creating thread pool\n"); + ret = 1; + goto view_end; + } + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + } if (is_header_only) goto view_end; // no need to print alignments if (optind + 1 >= argc) { // convert/print the entire file @@ -487,13 +519,19 @@ int main_samview(int argc, char *argv[]) } view_end: - if (is_count && ret == 0) - printf("%" PRId64 "\n", count); + if (is_count && ret == 0) { + if (fprintf(fn_out? fp_out : stdout, "%" PRId64 "\n", count) < 0) { + if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out); + else print_error_errno("view", "writing to standard output failed"); + ret = EXIT_FAILURE; + } + } // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); + if (fp_out) fclose(fp_out); free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); @@ -508,6 +546,10 @@ view_end: if (settings.remove_aux_len) { free(settings.remove_aux); } + + if (p.pool) + hts_tpool_destroy(p.pool); + return ret; } @@ -538,20 +580,19 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -l STR only include reads in library STR [null]\n" " -m INT only include reads with number of CIGAR operations consuming\n" " query sequence >= INT [0]\n" -" -f INT only include reads with all bits set in INT set in FLAG [0]\n" -" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" +" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 +" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n" +" fraction of templates/read pairs to keep; INT part sets seed)\n" // read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" -" -s FLOAT integer part sets seed of random number generator [0];\n" -" rest sets fraction of templates to subsample [no subsampling]\n" // general options -" -@, --threads INT\n" -" number of BAM/CRAM compression threads [0]\n" " -? print long help, including note about region specification\n" " -S ignored (input format is auto-detected)\n"); - sam_global_opt_help(fp, "-.O.T"); + sam_global_opt_help(fp, "-.O.T@"); fprintf(fp, "\n"); if (is_long_help) @@ -620,21 +661,37 @@ static void bam2fq_usage(FILE *to, const char *command) "Usage: samtools %s [options...] \n", command); fprintf(to, "Options:\n" -" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n" -" -1 FILE write paired reads flagged READ1 to FILE\n" -" -2 FILE write paired reads flagged READ2 to FILE\n" -" -f INT only include reads with all bits set in INT set in FLAG [0]\n" -" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" -" -n don't append /1 and /2 to the read name\n"); +" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n" +" -1 FILE write paired reads flagged READ1 to FILE\n" +" -2 FILE write paired reads flagged READ2 to FILE\n" +" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 +" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +" -n don't append /1 and /2 to the read name\n" +" -N always append /1 and /2 to the read name\n"); if (fq) fprintf(to, -" -O output quality in the OQ tag if present\n"); +" -O output quality in the OQ tag if present\n"); fprintf(to, -" -s FILE write singleton reads to FILE [assume single-end]\n" -" -t copy RG, BC and QT tags to the %s header line\n", +" -s FILE write singleton reads to FILE [assume single-end]\n" +" -t copy RG, BC and QT tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); if (fq) fprintf(to, -" -v INT default quality score if not given in file [1]\n"); - sam_global_opt_help(to, "-.--."); +" -v INT default quality score if not given in file [1]\n" +" --i1 FILE write first index reads to FILE\n" +" --i2 FILE write second index reads to FILE\n" +" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" +" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" +" --index-format STR How to parse barcode and quality tags\n\n"); + sam_global_opt_help(to, "-.--.@"); + fprintf(to, +" \n" +" The index-format string describes how to parse the barcode and quality tags, for example:\n" +" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" +" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" +" If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" +" 'read until the separator or end of tag', for example:\n" +" n*i* ignore the left part of the tag until the separator, then use the second part\n" +" of the tag as index 1\n"); } typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; @@ -643,24 +700,97 @@ typedef struct bam2fq_opts { char *fnse; char *fnr[3]; char *fn_input; // pointer to input filename in argv do not free - bool has12, use_oq, copy_tags; - int flag_on, flag_off; + bool has12, has12always, use_oq, copy_tags; + int flag_on, flag_off, flag_alloff; sam_global_args ga; fastfile filetype; int def_qual; + char *barcode_tag; + char *quality_tag; + char *index_file[2]; + char *index_format; } bam2fq_opts_t; typedef struct bam2fq_state { samFile *fp; FILE *fpse; FILE *fpr[3]; + FILE *fpi[2]; bam_hdr_t *h; bool has12, use_oq, copy_tags; - int flag_on, flag_off; + int flag_on, flag_off, flag_alloff; fastfile filetype; int def_qual; } bam2fq_state_t; +/* + * Get and decode the read from a BAM record. + * + * TODO: htslib really needs an interface for this. Consider this or perhaps + * bam_get_seq_str (current vs original orientation) and bam_get_qual_str + * functions as string formatted equivalents to bam_get_{seq,qual}? + */ + +/* + * Reverse a string in place. + * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. + * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik + */ +static char *reverse(char *str) +{ + int i = strlen(str)-1,j=0; + char ch; + while (i>j) { + ch = str[i]; + str[i]= str[j]; + str[j] = ch; + i--; + j++; + } + return str; +} + +/* return the read, reverse complemented if necessary */ +static char *get_read(const bam1_t *rec) +{ + int len = rec->core.l_qseq + 1; + char *read = calloc(1, len); + char *seq = (char *)bam_get_seq(rec); + int n; + + if (!read) return NULL; + + for (n=0; n < rec->core.l_qseq; n++) { + if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; + else read[n] = seq_nt16_str[bam_seqi(seq,n)]; + } + if (rec->core.flag & BAM_FREVERSE) reverse(read); + return read; +} + +/* + * get and decode the quality from a BAM record + */ +static char *get_quality(const bam1_t *rec) +{ + char *quality = calloc(1, rec->core.l_qseq + 1); + char *q = (char *)bam_get_qual(rec); + int n; + + if (*q == '\xff') { free(quality); return NULL; } + + for (n=0; n < rec->core.l_qseq; n++) { + quality[n] = q[n]+33; + } + if (rec->core.flag & BAM_FREVERSE) reverse(quality); + return quality; +} + +// +// End of htslib complaints +// + + static readpart which_readpart(const bam1_t *b) { if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { @@ -672,85 +802,60 @@ static readpart which_readpart(const bam1_t *b) } } -// Transform a bam1_t record into a string with the FASTQ representation of it -// @returns false for error, true for success -static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) +/* + * parse the length part from the index-format string + */ +static int getLength(char **s) { - int i; - int32_t qlen = b->core.l_qseq; - assert(qlen >= 0); - uint8_t *seq; - uint8_t *qual = bam_get_qual(b); - const uint8_t *oq = NULL; - if (state->use_oq) { - oq = bam_aux_get(b, "OQ"); - if (oq) oq++; // skip tag type + int n = 0; + while (**s) { + if (**s == '*') { n=-1; (*s)++; break; } + if ( !isdigit(**s)) break; + n = n*10 + ((**s)-'0'); + (*s)++; } - bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality + return n; +} + +static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) +{ + int i; linebuf->l = 0; // Write read name - readpart readpart = which_readpart(b); kputc(state->filetype == FASTA? '>' : '@', linebuf); - kputs(bam_get_qname(b), linebuf); + kputs(bam_get_qname(rec), linebuf); // Add the /1 /2 if requested if (state->has12) { + readpart readpart = which_readpart(rec); if (readpart == READ_1) kputs("/1", linebuf); else if (readpart == READ_2) kputs("/2", linebuf); } if (state->copy_tags) { for (i = 0; copied_tags[i]; ++i) { uint8_t *s; - if ((s = bam_aux_get(b, copied_tags[i])) != 0) { - kputc('\t', linebuf); - kputsn(copied_tags[i], 2, linebuf); - kputsn(":Z:", 3, linebuf); - kputs(bam_aux2Z(s), linebuf); + if ((s = bam_aux_get(rec, copied_tags[i])) != 0) { + if (*s == 'Z') { + kputc('\t', linebuf); + kputsn(copied_tags[i], 2, linebuf); + kputsn(":Z:", 3, linebuf); + kputs(bam_aux2Z(s), linebuf); + } } } } kputc('\n', linebuf); - - seq = bam_get_seq(b); - - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - for (i = qlen-1; i > -1; --i) { - char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]]; - kputc(c, linebuf); - } - } else { - for (i = 0; i < qlen; ++i) { - char c = seq_nt16_str[bam_seqi(seq,i)]; - kputc(c, linebuf); - } - } + kputs(seq, linebuf); kputc('\n', linebuf); if (state->filetype == FASTQ) { // Write quality kputs("+\n", linebuf); - if (has_qual) { - if (state->use_oq && oq) { - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - for (i = qlen-1; i > -1; --i) { - kputc(oq[i], linebuf); - } - } else { - kputs((char*)oq, linebuf); - } - } else { - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - for (i = qlen-1; i > -1; --i) { - kputc(33 + qual[i], linebuf); - } - } else { - for (i = 0; i < qlen; ++i) { - kputc(33 + qual[i], linebuf); - } - } - } + if (qual && *qual) { + kputs(qual, linebuf); } else { - for (i = 0; i < qlen; ++i) { + int len = strlen(seq); + for (i = 0; i < len; ++i) { kputc(33 + state->def_qual, linebuf); } } @@ -759,49 +864,214 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t return true; } +/* + * Create FASTQ lines from the barcode tag using the index-format + */ +static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts) +{ + uint8_t *p; + char *ifmt = opts->index_format; + char *tag = NULL; + char *qual = NULL; + int file_number = 0; + kstring_t linebuf = { 0, 0, NULL }; // Buffer + + // read barcode tag + p = bam_aux_get(rec,opts->barcode_tag); + if (p) tag = bam_aux2Z(p); + + if (!tag) return true; // there is no tag + + // read quality tag + p = bam_aux_get(rec, opts->quality_tag); + if (p) qual = bam_aux2Z(p); + + // Parse the index-format string + while (*ifmt) { + if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly + char action = *ifmt; // should be 'i' or 'n' + ifmt++; // skip over action + int index_len = getLength(&ifmt); + + char *sub_tag = calloc(1, strlen(tag)+1); + char *sub_qual = calloc(1, strlen(tag)+1); + int n = 0; + + if (index_len < 0) { + // read until separator + while (isalpha(*tag)) { + sub_tag[n] = *tag++; + if (qual) sub_qual[n] = *qual++; + n++; + } + if (*tag) { // skip separator + tag++; + if (qual) qual++; + } + } else { + // read index_len characters + while (index_len-- && *tag) { + sub_tag[n] = *tag++; + if (qual) sub_qual[n] = *qual++; + n++; + } + } + + if (action=='i' && *sub_tag && state->fpi[file_number]) { + make_fq_line(rec, sub_tag, sub_qual, &linebuf, state); + fputs(linebuf.s, state->fpi[file_number++]); + } + free(sub_qual); free(sub_tag); + + } + + free(linebuf.s); + return true; +} + +// Transform a bam1_t record into a string with the FASTQ representation of it +// @returns false for error, true for success +static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) +{ + int32_t qlen = b->core.l_qseq; + assert(qlen >= 0); + const uint8_t *oq = NULL; + char *qual = NULL; + + char *seq = get_read(b); + + if (state->use_oq) { + oq = bam_aux_get(b, "OQ"); + if (oq) { + oq++; + qual = strdup(bam_aux2Z(oq)); + if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented + reverse(qual); + } + } + } else { + qual = get_quality(b); + } + + make_fq_line(b, seq, qual, linebuf, state); + + free(qual); + free(seq); + return true; +} + +static void free_opts(bam2fq_opts_t *opts) +{ + free(opts->barcode_tag); + free(opts->quality_tag); + free(opts->index_format); + free(opts); +} + // return true if valid static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) { // Parse args bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); opts->has12 = true; + opts->has12always = false; opts->filetype = FASTQ; opts->def_qual = 1; + opts->barcode_tag = NULL; + opts->quality_tag = NULL; + opts->index_format = NULL; + opts->index_file[0] = NULL; + opts->index_file[1] = NULL; int c; sam_global_args_init(&opts->ga); static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), + {"i1", required_argument, NULL, 1}, + {"I1", required_argument, NULL, 1}, + {"i2", required_argument, NULL, 2}, + {"I2", required_argument, NULL, 2}, + {"if", required_argument, NULL, 3}, + {"IF", required_argument, NULL, 3}, + {"index-format", required_argument, NULL, 3}, + {"barcode-tag", required_argument, NULL, 'b'}, + {"quality-tag", required_argument, NULL, 'q'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) { + while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) { switch (c) { + case 'b': opts->barcode_tag = strdup(optarg); break; + case 'q': opts->quality_tag = strdup(optarg); break; + case 1 : opts->index_file[0] = optarg; break; + case 2 : opts->index_file[1] = optarg; break; + case 3 : opts->index_format = strdup(optarg); break; case '0': opts->fnr[0] = optarg; break; case '1': opts->fnr[1] = optarg; break; case '2': opts->fnr[2] = optarg; break; case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; case 'F': opts->flag_off |= strtol(optarg, 0, 0); break; + case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; case 'n': opts->has12 = false; break; + case 'N': opts->has12always = true; break; case 'O': opts->use_oq = true; break; case 's': opts->fnse = optarg; break; case 't': opts->copy_tags = true; break; case 'v': opts->def_qual = atoi(optarg); break; - case '?': bam2fq_usage(stderr, argv[0]); free(opts); return false; + case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; default: if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { - bam2fq_usage(stderr, argv[0]); free(opts); return false; + bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; } break; } } if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; + if (opts->has12always) opts->has12 = true; + + if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); + if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); + + int nIndex = 0; + if (opts->index_format) { + char *s; + for (s = opts->index_format; *s; s++) { + if (*s == 'i') nIndex++; + } + } + if (nIndex>2) { + fprintf(stderr,"Invalid index format: more than 2 indexes\n"); + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; + } + + if (opts->index_file[1] && !opts->index_file[0]) { + fprintf(stderr, "Index one specified, but index two not given\n"); + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; + } + + if (nIndex==2 && !opts->index_file[1]) { + fprintf(stderr, "index_format specifies two indexes, but only one index file given\n"); + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; + } + + if (nIndex==1 && !opts->index_file[0]) { + fprintf(stderr, "index_format specifies an index, but no index file given\n"); + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; + } if (opts->def_qual < 0 || 93 < opts->def_qual) { fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); bam2fq_usage(stderr, argv[0]); - free(opts); - return true; + free_opts(opts); + return false; } const char* type_str = argv[0]; @@ -812,20 +1082,21 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) } else { print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); bam2fq_usage(stderr, argv[0]); - free(opts); + free_opts(opts); return false; } if ((argc - (optind)) == 0) { + fprintf(stderr, "No input file specified.\n"); bam2fq_usage(stdout, argv[0]); - free(opts); + free_opts(opts); return false; } if ((argc - (optind)) != 1) { fprintf(stderr, "Too many arguments.\n"); bam2fq_usage(stderr, argv[0]); - free(opts); + free_opts(opts); return false; } opts->fn_input = argv[optind]; @@ -838,6 +1109,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); state->flag_on = opts->flag_on; state->flag_off = opts->flag_off; + state->flag_alloff = opts->flag_alloff; state->has12 = opts->has12; state->use_oq = opts->use_oq; state->copy_tags = opts->copy_tags; @@ -850,6 +1122,8 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) free(state); return false; } + if (opts->ga.nthreads > 0) + hts_set_threads(state->fp, opts->ga.nthreads); uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; if (opts->use_oq) rf |= SAM_AUX; if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { @@ -884,6 +1158,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->fpr[i] = stdout; } } + for (i = 0; i < 2; i++) { + state->fpi[i] = NULL; + if (opts->index_file[i]) { + state->fpi[i] = fopen(opts->index_file[i], "w"); + if (state->fpi[i] == NULL) { + print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]); + free(state); + return false; + } + } + } state->h = sam_hdr_read(state->fp); if (state->h == NULL) { @@ -906,6 +1191,12 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* for (i = 0; i < 3; ++i) { if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } } + for (i = 0; i < 2; i++) { + if (state->fpi[i] && fclose(state->fpi[i])) { + print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); + valid = false; + } + } free(state); return valid; } @@ -914,11 +1205,12 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) { return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags - || (b->core.flag&(state->flag_off)) != 0); + || (b->core.flag&(state->flag_off)) != 0 + || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); } -static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) +static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts) { bam1_t* b = bam_init1(); char *current_qname = NULL; @@ -974,6 +1266,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) return false; } score[which_readpart(b)] = b_score; + if (state->fpi[0]) tags2fq(b, state, opts); } } if (!valid) @@ -991,7 +1284,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) return valid; } -static bool bam2fq_mainloop(bam2fq_state_t *state) +static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) { // process a name collated BAM into fastq bam1_t* b = bam_init1(); @@ -1002,13 +1295,12 @@ static bool bam2fq_mainloop(bam2fq_state_t *state) int64_t n_reads = 0; // Statistics kstring_t linebuf = { 0, 0, NULL }; // Buffer while (sam_read1(state->fp, state->h, b) >= 0) { - if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments - || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags - || (b->core.flag&(state->flag_off)) != 0) continue; + if (filter_it_out(b, state)) continue; ++n_reads; if (!bam1_to_fq(b, &linebuf, state)) return false; fputs(linebuf.s, state->fpr[which_readpart(b)]); + if (state->fpi[0]) tags2fq(b, state, opts); } free(linebuf.s); bam_destroy1(b); @@ -1029,14 +1321,14 @@ int main_bam2fq(int argc, char *argv[]) if (!init_state(opts, &state)) return EXIT_FAILURE; if (state->fpse) { - if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE; + if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE; } else { - if (!bam2fq_mainloop(state)) status = EXIT_FAILURE; + if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; } if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; sam_global_args_free(&opts->ga); - free(opts); + free_opts(opts); return status; } diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 8c883b0..6df47c9 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -2,7 +2,7 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2015 Genome Research Ltd. + Copyright (C) 2009-2017 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include #include #include @@ -36,12 +37,18 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/sam.h" #include "htslib/faidx.h" #include "htslib/kstring.h" #include "htslib/khash.h" +#include "htslib/thread_pool.h" #include "samtools.h" #include "sam_opts.h" + +#define DEFAULT_BARCODE_TAG "BC" +#define DEFAULT_QUALITY_TAG "QT" + KHASH_SET_INIT_STR(rg) typedef khash_t(rg) *rghash_t; @@ -52,6 +59,7 @@ typedef struct samview_settings { int min_mapQ; int flag_on; int flag_off; + int flag_alloff; int min_qlen; int remove_B; uint32_t subsam_seed; @@ -85,6 +93,8 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin } if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off)) return 1; + if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) + return 1; if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { @@ -233,19 +243,22 @@ static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; - int is_long_help = 0, n_threads = 0; + int is_long_help = 0; int64_t count = 0; samFile *in = 0, *out = 0, *un_out=0; + FILE *fp_out = NULL; bam_hdr_t *header = NULL; char out_mode[5], out_un_mode[5], *out_format = ""; char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool p = {NULL, 0}; samview_settings_t settings = { .rghash = NULL, .min_mapQ = 0, .flag_on = 0, .flag_off = 0, + .flag_alloff = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, @@ -255,8 +268,7 @@ int main_samview(int argc, char *argv[]) }; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'), - { "threads", required_argument, NULL, '@' }, + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), { NULL, 0, NULL, 0 } }; @@ -264,11 +276,13 @@ int main_samview(int argc, char *argv[]) strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:", lopts, NULL)) >= 0) { switch (c) { case 's': if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { + // Convert likely user input 0,1,2,... to pseudo-random + // values with more entropy and more bits set srand(settings.subsam_seed); settings.subsam_seed = rand(); } @@ -286,6 +300,7 @@ int main_samview(int argc, char *argv[]) case 'U': fn_un_out = strdup(optarg); break; case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; + case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; @@ -315,7 +330,6 @@ int main_samview(int argc, char *argv[]) */ case '?': is_long_help = 1; break; case 'B': settings.remove_B = 1; break; - case '@': n_threads = strtol(optarg, 0, 0); break; case 'x': { if (strlen(optarg) != 2) { @@ -427,8 +441,26 @@ int main_samview(int argc, char *argv[]) } } } + else { + if (fn_out) { + fp_out = fopen(fn_out, "w"); + if (fp_out == NULL) { + print_error_errno("view", "can't create \"%s\"", fn_out); + ret = EXIT_FAILURE; + goto view_end; + } + } + } - if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } + if (ga.nthreads > 1) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(pysam_stderr, "Error creating thread pool\n"); + ret = 1; + goto view_end; + } + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + } if (is_header_only) goto view_end; // no need to print alignments if (optind + 1 >= argc) { // convert/print the entire file @@ -489,13 +521,19 @@ int main_samview(int argc, char *argv[]) } view_end: - if (is_count && ret == 0) - fprintf(pysam_stdout, "%" PRId64 "\n", count); - + if (is_count && ret == 0) { + if (fprintf(fn_out? fp_out : pysam_stdout, "%" PRId64 "\n", count) < 0) { + if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out); + else print_error_errno("view", "writing to standard output failed"); + ret = EXIT_FAILURE; + } + } + // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); + if (fp_out) fclose(fp_out); free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); @@ -510,6 +548,10 @@ view_end: if (settings.remove_aux_len) { free(settings.remove_aux); } + + if (p.pool) + hts_tpool_destroy(p.pool); + return ret; } @@ -540,20 +582,19 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -l STR only include reads in library STR [null]\n" " -m INT only include reads with number of CIGAR operations consuming\n" " query sequence >= INT [0]\n" -" -f INT only include reads with all bits set in INT set in FLAG [0]\n" -" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" +" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 +" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n" +" fraction of templates/read pairs to keep; INT part sets seed)\n" // read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" -" -s FLOAT integer part sets seed of random number generator [0];\n" -" rest sets fraction of templates to subsample [no subsampling]\n" // general options -" -@, --threads INT\n" -" number of BAM/CRAM compression threads [0]\n" " -? print long help, including note about region specification\n" " -S ignored (input format is auto-detected)\n"); - sam_global_opt_help(fp, "-.O.T"); + sam_global_opt_help(fp, "-.O.T@"); fprintf(fp, "\n"); if (is_long_help) @@ -622,21 +663,37 @@ static void bam2fq_usage(FILE *to, const char *command) "Usage: samtools %s [options...] \n", command); fprintf(to, "Options:\n" -" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n" -" -1 FILE write paired reads flagged READ1 to FILE\n" -" -2 FILE write paired reads flagged READ2 to FILE\n" -" -f INT only include reads with all bits set in INT set in FLAG [0]\n" -" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" -" -n don't append /1 and /2 to the read name\n"); +" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n" +" -1 FILE write paired reads flagged READ1 to FILE\n" +" -2 FILE write paired reads flagged READ2 to FILE\n" +" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 +" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +" -n don't append /1 and /2 to the read name\n" +" -N always append /1 and /2 to the read name\n"); if (fq) fprintf(to, -" -O output quality in the OQ tag if present\n"); +" -O output quality in the OQ tag if present\n"); fprintf(to, -" -s FILE write singleton reads to FILE [assume single-end]\n" -" -t copy RG, BC and QT tags to the %s header line\n", +" -s FILE write singleton reads to FILE [assume single-end]\n" +" -t copy RG, BC and QT tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); if (fq) fprintf(to, -" -v INT default quality score if not given in file [1]\n"); - sam_global_opt_help(to, "-.--."); +" -v INT default quality score if not given in file [1]\n" +" --i1 FILE write first index reads to FILE\n" +" --i2 FILE write second index reads to FILE\n" +" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" +" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" +" --index-format STR How to parse barcode and quality tags\n\n"); + sam_global_opt_help(to, "-.--.@"); + fprintf(to, +" \n" +" The index-format string describes how to parse the barcode and quality tags, for example:\n" +" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" +" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" +" If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" +" 'read until the separator or end of tag', for example:\n" +" n*i* ignore the left part of the tag until the separator, then use the second part\n" +" of the tag as index 1\n"); } typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; @@ -645,24 +702,97 @@ typedef struct bam2fq_opts { char *fnse; char *fnr[3]; char *fn_input; // pointer to input filename in argv do not free - bool has12, use_oq, copy_tags; - int flag_on, flag_off; + bool has12, has12always, use_oq, copy_tags; + int flag_on, flag_off, flag_alloff; sam_global_args ga; fastfile filetype; int def_qual; + char *barcode_tag; + char *quality_tag; + char *index_file[2]; + char *index_format; } bam2fq_opts_t; typedef struct bam2fq_state { samFile *fp; FILE *fpse; FILE *fpr[3]; + FILE *fpi[2]; bam_hdr_t *h; bool has12, use_oq, copy_tags; - int flag_on, flag_off; + int flag_on, flag_off, flag_alloff; fastfile filetype; int def_qual; } bam2fq_state_t; +/* + * Get and decode the read from a BAM record. + * + * TODO: htslib really needs an interface for this. Consider this or perhaps + * bam_get_seq_str (current vs original orientation) and bam_get_qual_str + * functions as string formatted equivalents to bam_get_{seq,qual}? + */ + +/* + * Reverse a string in place. + * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. + * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik + */ +static char *reverse(char *str) +{ + int i = strlen(str)-1,j=0; + char ch; + while (i>j) { + ch = str[i]; + str[i]= str[j]; + str[j] = ch; + i--; + j++; + } + return str; +} + +/* return the read, reverse complemented if necessary */ +static char *get_read(const bam1_t *rec) +{ + int len = rec->core.l_qseq + 1; + char *read = calloc(1, len); + char *seq = (char *)bam_get_seq(rec); + int n; + + if (!read) return NULL; + + for (n=0; n < rec->core.l_qseq; n++) { + if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; + else read[n] = seq_nt16_str[bam_seqi(seq,n)]; + } + if (rec->core.flag & BAM_FREVERSE) reverse(read); + return read; +} + +/* + * get and decode the quality from a BAM record + */ +static char *get_quality(const bam1_t *rec) +{ + char *quality = calloc(1, rec->core.l_qseq + 1); + char *q = (char *)bam_get_qual(rec); + int n; + + if (*q == '\xff') { free(quality); return NULL; } + + for (n=0; n < rec->core.l_qseq; n++) { + quality[n] = q[n]+33; + } + if (rec->core.flag & BAM_FREVERSE) reverse(quality); + return quality; +} + +// +// End of htslib complaints +// + + static readpart which_readpart(const bam1_t *b) { if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { @@ -674,85 +804,60 @@ static readpart which_readpart(const bam1_t *b) } } -// Transform a bam1_t record into a string with the FASTQ representation of it -// @returns false for error, true for success -static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) +/* + * parse the length part from the index-format string + */ +static int getLength(char **s) { - int i; - int32_t qlen = b->core.l_qseq; - assert(qlen >= 0); - uint8_t *seq; - uint8_t *qual = bam_get_qual(b); - const uint8_t *oq = NULL; - if (state->use_oq) { - oq = bam_aux_get(b, "OQ"); - if (oq) oq++; // skip tag type + int n = 0; + while (**s) { + if (**s == '*') { n=-1; (*s)++; break; } + if ( !isdigit(**s)) break; + n = n*10 + ((**s)-'0'); + (*s)++; } - bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality + return n; +} + +static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) +{ + int i; linebuf->l = 0; // Write read name - readpart readpart = which_readpart(b); kputc(state->filetype == FASTA? '>' : '@', linebuf); - kputs(bam_get_qname(b), linebuf); + kputs(bam_get_qname(rec), linebuf); // Add the /1 /2 if requested if (state->has12) { + readpart readpart = which_readpart(rec); if (readpart == READ_1) kputs("/1", linebuf); else if (readpart == READ_2) kputs("/2", linebuf); } if (state->copy_tags) { for (i = 0; copied_tags[i]; ++i) { uint8_t *s; - if ((s = bam_aux_get(b, copied_tags[i])) != 0) { - kputc('\t', linebuf); - kputsn(copied_tags[i], 2, linebuf); - kputsn(":Z:", 3, linebuf); - kputs(bam_aux2Z(s), linebuf); + if ((s = bam_aux_get(rec, copied_tags[i])) != 0) { + if (*s == 'Z') { + kputc('\t', linebuf); + kputsn(copied_tags[i], 2, linebuf); + kputsn(":Z:", 3, linebuf); + kputs(bam_aux2Z(s), linebuf); + } } } } kputc('\n', linebuf); - - seq = bam_get_seq(b); - - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - for (i = qlen-1; i > -1; --i) { - char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]]; - kputc(c, linebuf); - } - } else { - for (i = 0; i < qlen; ++i) { - char c = seq_nt16_str[bam_seqi(seq,i)]; - kputc(c, linebuf); - } - } + kputs(seq, linebuf); kputc('\n', linebuf); if (state->filetype == FASTQ) { // Write quality kputs("+\n", linebuf); - if (has_qual) { - if (state->use_oq && oq) { - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - for (i = qlen-1; i > -1; --i) { - kputc(oq[i], linebuf); - } - } else { - kputs((char*)oq, linebuf); - } - } else { - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - for (i = qlen-1; i > -1; --i) { - kputc(33 + qual[i], linebuf); - } - } else { - for (i = 0; i < qlen; ++i) { - kputc(33 + qual[i], linebuf); - } - } - } + if (qual && *qual) { + kputs(qual, linebuf); } else { - for (i = 0; i < qlen; ++i) { + int len = strlen(seq); + for (i = 0; i < len; ++i) { kputc(33 + state->def_qual, linebuf); } } @@ -761,49 +866,214 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t return true; } +/* + * Create FASTQ lines from the barcode tag using the index-format + */ +static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts) +{ + uint8_t *p; + char *ifmt = opts->index_format; + char *tag = NULL; + char *qual = NULL; + int file_number = 0; + kstring_t linebuf = { 0, 0, NULL }; // Buffer + + // read barcode tag + p = bam_aux_get(rec,opts->barcode_tag); + if (p) tag = bam_aux2Z(p); + + if (!tag) return true; // there is no tag + + // read quality tag + p = bam_aux_get(rec, opts->quality_tag); + if (p) qual = bam_aux2Z(p); + + // Parse the index-format string + while (*ifmt) { + if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly + char action = *ifmt; // should be 'i' or 'n' + ifmt++; // skip over action + int index_len = getLength(&ifmt); + + char *sub_tag = calloc(1, strlen(tag)+1); + char *sub_qual = calloc(1, strlen(tag)+1); + int n = 0; + + if (index_len < 0) { + // read until separator + while (isalpha(*tag)) { + sub_tag[n] = *tag++; + if (qual) sub_qual[n] = *qual++; + n++; + } + if (*tag) { // skip separator + tag++; + if (qual) qual++; + } + } else { + // read index_len characters + while (index_len-- && *tag) { + sub_tag[n] = *tag++; + if (qual) sub_qual[n] = *qual++; + n++; + } + } + + if (action=='i' && *sub_tag && state->fpi[file_number]) { + make_fq_line(rec, sub_tag, sub_qual, &linebuf, state); + fputs(linebuf.s, state->fpi[file_number++]); + } + free(sub_qual); free(sub_tag); + + } + + free(linebuf.s); + return true; +} + +// Transform a bam1_t record into a string with the FASTQ representation of it +// @returns false for error, true for success +static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) +{ + int32_t qlen = b->core.l_qseq; + assert(qlen >= 0); + const uint8_t *oq = NULL; + char *qual = NULL; + + char *seq = get_read(b); + + if (state->use_oq) { + oq = bam_aux_get(b, "OQ"); + if (oq) { + oq++; + qual = strdup(bam_aux2Z(oq)); + if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented + reverse(qual); + } + } + } else { + qual = get_quality(b); + } + + make_fq_line(b, seq, qual, linebuf, state); + + free(qual); + free(seq); + return true; +} + +static void free_opts(bam2fq_opts_t *opts) +{ + free(opts->barcode_tag); + free(opts->quality_tag); + free(opts->index_format); + free(opts); +} + // return true if valid static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) { // Parse args bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); opts->has12 = true; + opts->has12always = false; opts->filetype = FASTQ; opts->def_qual = 1; + opts->barcode_tag = NULL; + opts->quality_tag = NULL; + opts->index_format = NULL; + opts->index_file[0] = NULL; + opts->index_file[1] = NULL; int c; sam_global_args_init(&opts->ga); static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), + {"i1", required_argument, NULL, 1}, + {"I1", required_argument, NULL, 1}, + {"i2", required_argument, NULL, 2}, + {"I2", required_argument, NULL, 2}, + {"if", required_argument, NULL, 3}, + {"IF", required_argument, NULL, 3}, + {"index-format", required_argument, NULL, 3}, + {"barcode-tag", required_argument, NULL, 'b'}, + {"quality-tag", required_argument, NULL, 'q'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) { + while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) { switch (c) { + case 'b': opts->barcode_tag = strdup(optarg); break; + case 'q': opts->quality_tag = strdup(optarg); break; + case 1 : opts->index_file[0] = optarg; break; + case 2 : opts->index_file[1] = optarg; break; + case 3 : opts->index_format = strdup(optarg); break; case '0': opts->fnr[0] = optarg; break; case '1': opts->fnr[1] = optarg; break; case '2': opts->fnr[2] = optarg; break; case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; case 'F': opts->flag_off |= strtol(optarg, 0, 0); break; + case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; case 'n': opts->has12 = false; break; + case 'N': opts->has12always = true; break; case 'O': opts->use_oq = true; break; case 's': opts->fnse = optarg; break; case 't': opts->copy_tags = true; break; case 'v': opts->def_qual = atoi(optarg); break; - case '?': bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false; + case '?': bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false; default: if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { - bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false; + bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false; } break; } } if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; + if (opts->has12always) opts->has12 = true; + + if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); + if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); + + int nIndex = 0; + if (opts->index_format) { + char *s; + for (s = opts->index_format; *s; s++) { + if (*s == 'i') nIndex++; + } + } + if (nIndex>2) { + fprintf(pysam_stderr,"Invalid index format: more than 2 indexes\n"); + bam2fq_usage(pysam_stderr, argv[0]); + free_opts(opts); + return false; + } + + if (opts->index_file[1] && !opts->index_file[0]) { + fprintf(pysam_stderr, "Index one specified, but index two not given\n"); + bam2fq_usage(pysam_stderr, argv[0]); + free_opts(opts); + return false; + } + + if (nIndex==2 && !opts->index_file[1]) { + fprintf(pysam_stderr, "index_format specifies two indexes, but only one index file given\n"); + bam2fq_usage(pysam_stderr, argv[0]); + free_opts(opts); + return false; + } + + if (nIndex==1 && !opts->index_file[0]) { + fprintf(pysam_stderr, "index_format specifies an index, but no index file given\n"); + bam2fq_usage(pysam_stderr, argv[0]); + free_opts(opts); + return false; + } if (opts->def_qual < 0 || 93 < opts->def_qual) { fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); bam2fq_usage(pysam_stderr, argv[0]); - free(opts); - return true; + free_opts(opts); + return false; } const char* type_str = argv[0]; @@ -814,20 +1084,21 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) } else { print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); bam2fq_usage(pysam_stderr, argv[0]); - free(opts); + free_opts(opts); return false; } if ((argc - (optind)) == 0) { + fprintf(pysam_stderr, "No input file specified.\n"); bam2fq_usage(pysam_stdout, argv[0]); - free(opts); + free_opts(opts); return false; } if ((argc - (optind)) != 1) { fprintf(pysam_stderr, "Too many arguments.\n"); bam2fq_usage(pysam_stderr, argv[0]); - free(opts); + free_opts(opts); return false; } opts->fn_input = argv[optind]; @@ -840,6 +1111,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); state->flag_on = opts->flag_on; state->flag_off = opts->flag_off; + state->flag_alloff = opts->flag_alloff; state->has12 = opts->has12; state->use_oq = opts->use_oq; state->copy_tags = opts->copy_tags; @@ -852,6 +1124,8 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) free(state); return false; } + if (opts->ga.nthreads > 0) + hts_set_threads(state->fp, opts->ga.nthreads); uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; if (opts->use_oq) rf |= SAM_AUX; if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { @@ -886,6 +1160,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->fpr[i] = pysam_stdout; } } + for (i = 0; i < 2; i++) { + state->fpi[i] = NULL; + if (opts->index_file[i]) { + state->fpi[i] = fopen(opts->index_file[i], "w"); + if (state->fpi[i] == NULL) { + print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]); + free(state); + return false; + } + } + } state->h = sam_hdr_read(state->fp); if (state->h == NULL) { @@ -908,6 +1193,12 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* for (i = 0; i < 3; ++i) { if (state->fpr[i] != pysam_stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } } + for (i = 0; i < 2; i++) { + if (state->fpi[i] && fclose(state->fpi[i])) { + print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); + valid = false; + } + } free(state); return valid; } @@ -916,11 +1207,12 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) { return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags - || (b->core.flag&(state->flag_off)) != 0); + || (b->core.flag&(state->flag_off)) != 0 + || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); } -static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) +static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts) { bam1_t* b = bam_init1(); char *current_qname = NULL; @@ -976,6 +1268,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) return false; } score[which_readpart(b)] = b_score; + if (state->fpi[0]) tags2fq(b, state, opts); } } if (!valid) @@ -993,7 +1286,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) return valid; } -static bool bam2fq_mainloop(bam2fq_state_t *state) +static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) { // process a name collated BAM into fastq bam1_t* b = bam_init1(); @@ -1004,13 +1297,12 @@ static bool bam2fq_mainloop(bam2fq_state_t *state) int64_t n_reads = 0; // Statistics kstring_t linebuf = { 0, 0, NULL }; // Buffer while (sam_read1(state->fp, state->h, b) >= 0) { - if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments - || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags - || (b->core.flag&(state->flag_off)) != 0) continue; + if (filter_it_out(b, state)) continue; ++n_reads; if (!bam1_to_fq(b, &linebuf, state)) return false; fputs(linebuf.s, state->fpr[which_readpart(b)]); + if (state->fpi[0]) tags2fq(b, state, opts); } free(linebuf.s); bam_destroy1(b); @@ -1031,14 +1323,14 @@ int main_bam2fq(int argc, char *argv[]) if (!init_state(opts, &state)) return EXIT_FAILURE; if (state->fpse) { - if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE; + if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE; } else { - if (!bam2fq_mainloop(state)) status = EXIT_FAILURE; + if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; } if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; sam_global_args_free(&opts->ga); - free(opts); + free_opts(opts); return status; } diff --git a/samtools/stats.c b/samtools/stats.c index eb6bb52..35574ed 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -828,8 +828,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) // reads. Mates mapped to different chromosomes have isize==0. int32_t isize = bam_line->core.isize; if ( isize<0 ) isize = -isize; - if ( stats->info->nisize > 0 && isize >= stats->info->nisize ) - isize = stats->info->nisize-1; + if ( stats->info->nisize > 0 && isize > stats->info->nisize ) + isize = stats->info->nisize; if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) { int pos_fst = bam_line->core.mpos - bam_line->core.pos; @@ -1263,7 +1263,7 @@ void init_regions(stats_t *stats, const char *file) stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); } - if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); + if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); if ( prev_tid==-1 || prev_tid!=tid ) { prev_tid = tid; @@ -1375,7 +1375,7 @@ static void error(const char *format, ...) printf(" -S, --split Also write statistics to separate files split by tagged field.\n"); printf(" -t, --target-regions Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); - sam_global_opt_help(stdout, "-.--."); + sam_global_opt_help(stdout, "-.--.@"); printf("\n"); } else @@ -1481,13 +1481,13 @@ int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFor // .. bam samFile* sam; if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) { - error("Failed to open: %s\n", bam_fname); + print_error_errno("stats", "failed to open \"%s\"", bam_fname); return 1; } info->sam = sam; info->sam_header = sam_hdr_read(sam); if (info->sam_header == NULL) { - error("Failed to read header for '%s'\n", bam_fname); + print_error("stats", "failed to read header for \"%s\"", bam_fname); return 1; } return 0; @@ -1537,7 +1537,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); - stats->isize = init_isize_t(info->nisize); + stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0); stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t)); @@ -1596,7 +1596,7 @@ int main_stats(int argc, char *argv[]) static const struct option loptions[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), {"help", no_argument, NULL, 'h'}, {"remove-dups", no_argument, NULL, 'd'}, {"sam", no_argument, NULL, 's'}, @@ -1618,7 +1618,7 @@ int main_stats(int argc, char *argv[]) }; int opt; - while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 ) + while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:@:",loptions,NULL))>0 ) { switch (opt) { @@ -1662,6 +1662,8 @@ int main_stats(int argc, char *argv[]) } if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1; + if (ga.nthreads > 0) + hts_set_threads(info->sam, ga.nthreads); stats_t *all_stats = stats_init(); stats_t *curr_stats = NULL; diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index da187ac..8ebb52a 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -220,7 +220,7 @@ typedef struct stats_t; KHASH_MAP_INIT_STR(c2stats, stats_t*) -static void error(const char *format, ...); +static int error(const char *format, ...); int is_in_regions(bam1_t *bam_line, stats_t *stats); void realloc_buffers(stats_t *stats, int seq_len); @@ -830,8 +830,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) // reads. Mates mapped to different chromosomes have isize==0. int32_t isize = bam_line->core.isize; if ( isize<0 ) isize = -isize; - if ( stats->info->nisize > 0 && isize >= stats->info->nisize ) - isize = stats->info->nisize-1; + if ( stats->info->nisize > 0 && isize > stats->info->nisize ) + isize = stats->info->nisize; if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) { int pos_fst = bam_line->core.mpos - bam_line->core.pos; @@ -1265,7 +1265,7 @@ void init_regions(stats_t *stats, const char *file) stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); } - if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); + if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); if ( prev_tid==-1 || prev_tid!=tid ) { prev_tid = tid; @@ -1352,7 +1352,7 @@ void init_group_id(stats_t *stats, const char *id) } -static void error(const char *format, ...) +static int error(const char *format, ...) { if ( !format ) { @@ -1377,8 +1377,9 @@ static void error(const char *format, ...) fprintf(pysam_stdout, " -S, --split Also write statistics to separate files split by tagged field.\n"); fprintf(pysam_stdout, " -t, --target-regions Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); fprintf(pysam_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); - sam_global_opt_help(pysam_stdout, "-.--."); + sam_global_opt_help(pysam_stdout, "-.--.@"); fprintf(pysam_stdout, "\n"); + return(0); } else { @@ -1483,13 +1484,13 @@ int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFor // .. bam samFile* sam; if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) { - error("Failed to open: %s\n", bam_fname); + print_error_errno("stats", "failed to open \"%s\"", bam_fname); return 1; } info->sam = sam; info->sam_header = sam_hdr_read(sam); if (info->sam_header == NULL) { - error("Failed to read header for '%s'\n", bam_fname); + print_error("stats", "failed to read header for \"%s\"", bam_fname); return 1; } return 0; @@ -1539,7 +1540,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); - stats->isize = init_isize_t(info->nisize); + stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0); stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t)); @@ -1598,7 +1599,7 @@ int main_stats(int argc, char *argv[]) static const struct option loptions[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), {"help", no_argument, NULL, 'h'}, {"remove-dups", no_argument, NULL, 'd'}, {"sam", no_argument, NULL, 's'}, @@ -1620,7 +1621,7 @@ int main_stats(int argc, char *argv[]) }; int opt; - while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 ) + while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:@:",loptions,NULL))>0 ) { switch (opt) { @@ -1646,7 +1647,7 @@ int main_stats(int argc, char *argv[]) case 'S': info->split_tag = optarg; break; case 'P': info->split_prefix = optarg; break; case '?': - case 'h': error(NULL); + case 'h': return(error(NULL)); default: if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0) error("Unknown argument: %s\n", optarg); @@ -1659,11 +1660,13 @@ int main_stats(int argc, char *argv[]) if ( !bam_fname ) { if ( isatty(STDIN_FILENO) ) - error(NULL); + return(error(NULL)); bam_fname = "-"; } if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1; + if (ga.nthreads > 0) + hts_set_threads(info->sam, ga.nthreads); stats_t *all_stats = stats_init(); stats_t *curr_stats = NULL; diff --git a/samtools/test/split/test_filter_header_rg.c b/samtools/test/split/test_filter_header_rg.c index d9505d6..cccf0e9 100644 --- a/samtools/test/split/test_filter_header_rg.c +++ b/samtools/test/split/test_filter_header_rg.c @@ -42,7 +42,8 @@ void setup_test_1(bam_hdr_t** hdr_in) bool check_test_1(const bam_hdr_t* hdr) { const char *test1_res = "@HD\tVN:1.4\n" - "@SQ\tSN:blah\n"; + "@SQ\tSN:blah\n" + "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; if (strcmp(hdr->text, test1_res)) { return false; @@ -65,7 +66,8 @@ bool check_test_2(const bam_hdr_t* hdr) { const char *test2_res = "@HD\tVN:1.4\n" "@SQ\tSN:blah\n" - "@RG\tID:fish\n"; + "@RG\tID:fish\n" + "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; if (strcmp(hdr->text, test2_res)) { return false; @@ -73,7 +75,7 @@ bool check_test_2(const bam_hdr_t* hdr) { return true; } -int main(int argc, char**argv) +int main(int argc, char *argv[]) { // test state const int NUM_TESTS = 2; @@ -82,6 +84,8 @@ int main(int argc, char**argv) int failure = 0; int getopt_char; + char *test_argv[] = { "test_filter_header_rg", "foo\tbar", "baz" }; + char *arg_list = stringify_argv(3, test_argv); while ((getopt_char = getopt(argc, argv, "v")) != -1) { switch (getopt_char) { case 'v': @@ -116,7 +120,7 @@ int main(int argc, char**argv) // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe - bool result_1 = filter_header_rg(hdr1, id_to_keep_1); + bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list); fclose(stderr); if (verbose) printf("END RUN test 1\n"); @@ -155,7 +159,7 @@ int main(int argc, char**argv) // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe - bool result_2 = filter_header_rg(hdr2, id_to_keep_2); + bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list); fclose(stderr); if (verbose) printf("END RUN test 2\n"); @@ -185,6 +189,7 @@ int main(int argc, char**argv) // Cleanup free(res.s); + free(arg_list); remove(tempfname); if (failure > 0) fprintf(orig_stderr, "%d failures %d successes\n", failure, success); diff --git a/samtools/test/split/test_filter_header_rg.c.pysam.c b/samtools/test/split/test_filter_header_rg.c.pysam.c index 97b3573..c9284f6 100644 --- a/samtools/test/split/test_filter_header_rg.c.pysam.c +++ b/samtools/test/split/test_filter_header_rg.c.pysam.c @@ -44,7 +44,8 @@ void setup_test_1(bam_hdr_t** hdr_in) bool check_test_1(const bam_hdr_t* hdr) { const char *test1_res = "@HD\tVN:1.4\n" - "@SQ\tSN:blah\n"; + "@SQ\tSN:blah\n" + "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; if (strcmp(hdr->text, test1_res)) { return false; @@ -67,7 +68,8 @@ bool check_test_2(const bam_hdr_t* hdr) { const char *test2_res = "@HD\tVN:1.4\n" "@SQ\tSN:blah\n" - "@RG\tID:fish\n"; + "@RG\tID:fish\n" + "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; if (strcmp(hdr->text, test2_res)) { return false; @@ -75,7 +77,7 @@ bool check_test_2(const bam_hdr_t* hdr) { return true; } -int samtools_test_filter_header_rg_main(int argc, char**argv) +int samtools_test_filter_header_rg_main(int argc, char *argv[]) { // test state const int NUM_TESTS = 2; @@ -84,6 +86,8 @@ int samtools_test_filter_header_rg_main(int argc, char**argv) int failure = 0; int getopt_char; + char *test_argv[] = { "test_filter_header_rg", "foo\tbar", "baz" }; + char *arg_list = stringify_argv(3, test_argv); while ((getopt_char = getopt(argc, argv, "v")) != -1) { switch (getopt_char) { case 'v': @@ -118,7 +122,7 @@ int samtools_test_filter_header_rg_main(int argc, char**argv) // test xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe - bool result_1 = filter_header_rg(hdr1, id_to_keep_1); + bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list); fclose(pysam_stderr); if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); @@ -157,7 +161,7 @@ int samtools_test_filter_header_rg_main(int argc, char**argv) // test xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe - bool result_2 = filter_header_rg(hdr2, id_to_keep_2); + bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list); fclose(pysam_stderr); if (verbose) fprintf(pysam_stdout, "END RUN test 2\n"); @@ -187,6 +191,7 @@ int samtools_test_filter_header_rg_main(int argc, char**argv) // Cleanup free(res.s); + free(arg_list); remove(tempfname); if (failure > 0) fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); diff --git a/samtools/test/test.c b/samtools/test/test.c index 7ab38af..fb0b549 100644 --- a/samtools/test/test.c +++ b/samtools/test/test.c @@ -1,6 +1,6 @@ /* test/test.c -- test harness utility routines. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014, 2016 Genome Research Ltd. Author: Martin O. Pollard @@ -53,3 +53,9 @@ void dump_hdr(const bam_hdr_t* hdr) } printf("text: \"%s\"\n", hdr->text); } + +// For tests, just return a constant that can be embedded in expected output. +const char *samtools_version(void) +{ + return "x.y.test"; +} diff --git a/samtools/test/test.c.pysam.c b/samtools/test/test.c.pysam.c index a8295b5..bf460e8 100644 --- a/samtools/test/test.c.pysam.c +++ b/samtools/test/test.c.pysam.c @@ -2,7 +2,7 @@ /* test/test.c -- test harness utility routines. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014, 2016 Genome Research Ltd. Author: Martin O. Pollard @@ -55,3 +55,9 @@ void dump_hdr(const bam_hdr_t* hdr) } fprintf(pysam_stdout, "text: \"%s\"\n", hdr->text); } + +// For tests, just return a constant that can be embedded in expected output. +const char *samtools_version(void) +{ + return "x.y.test"; +} diff --git a/samtools/version.h b/samtools/version.h index ec46e67..004d7ed 100644 --- a/samtools/version.h +++ b/samtools/version.h @@ -1 +1 @@ -#define SAMTOOLS_VERSION "1.3.1" +#define SAMTOOLS_VERSION "1.4.1" diff --git a/setup.py b/setup.py index 6d52617..5b23d20 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ This module provides a low-level wrapper around the htslib C-API as using cython and a high-level API for convenient access to the data within standard genomic file formats. -The current version wraps htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1. +The current version wraps htslib-1.4.1, samtools-1.4.1 and bcftools-1.4.1. See: http://www.htslib.org @@ -78,6 +78,11 @@ def configure_library(library_dir, env_options=None, options=[]): configure_script = os.path.join(library_dir, "configure") + on_rtd = os.environ.get("READTHEDOCS") == "True" + # RTD has no bzip2 development libraries installed: + if on_rtd: + env_options = "--disable-bz2" + if not os.path.exists(configure_script): raise ValueError( "configure script {} does not exist".format(configure_script)) @@ -246,8 +251,8 @@ elif HTSLIB_MODE == 'shared': # htslib built from sources included in the pysam # package. htslib_library_dirs = [ - 'pysam', - ".", + "pysam", # when using setup.py develop? + ".", # when using setup.py develop? os.path.join("build", distutils_dir_name("lib"), "pysam")] htslib_include_dirs = ['htslib'] @@ -255,7 +260,15 @@ elif HTSLIB_MODE == 'shared': else: raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE) -internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]] +suffix = sysconfig.get_config_var('EXT_SUFFIX') +if not suffix: + suffix = sysconfig.get_config_var('SO') +internal_htslib_libraries = [os.path.splitext("chtslib{}".format(suffix))[0]] + +internal_tools_libraries = [ + os.path.splitext("csamtools{}".format(suffix))[0], + os.path.splitext("cbcftools{}".format(suffix))[0], + ] # build config.py with open(os.path.join("pysam", "config.py"), "w") as outf: @@ -268,7 +281,7 @@ with open(os.path.join("pysam", "config.py"), "w") as outf: if line.startswith("#define"): key, value = re.match( "#define (\S+)\s+(\S+)", line).groups() - config_values[key] = int(value) + config_values[key] = value for key in ["ENABLE_PLUGINS", "HAVE_COMMONCRYPTO", "HAVE_GMTIME_R", @@ -353,7 +366,6 @@ chtslib = Extension( shared_htslib_sources + os_c_files, library_dirs=htslib_library_dirs, - runtime_library_dirs=htslib_library_dirs, include_dirs=["pysam", "."] + include_os + htslib_include_dirs, libraries=external_htslib_libraries, language="c", @@ -369,8 +381,7 @@ csamfile = Extension( "pysam.libcsamfile", [source_pattern % "samfile", "pysam/htslib_util.c", - "pysam/samfile_util.c", - "samtools/kprobaln.c"] + + "pysam/samfile_util.c"] + htslib_sources + os_c_files, library_dirs=htslib_library_dirs, @@ -389,8 +400,7 @@ calignmentfile = Extension( "pysam.libcalignmentfile", [source_pattern % "alignmentfile", "pysam/htslib_util.c", - "pysam/samfile_util.c", - "samtools/kprobaln.c"] + + "pysam/samfile_util.c"] + htslib_sources + os_c_files, library_dirs=htslib_library_dirs, @@ -409,8 +419,7 @@ calignedsegment = Extension( "pysam.libcalignedsegment", [source_pattern % "alignedsegment", "pysam/htslib_util.c", - "pysam/samfile_util.c", - "samtools/kprobaln.c"] + + "pysam/samfile_util.c"] + htslib_sources + os_c_files, library_dirs=htslib_library_dirs, @@ -435,17 +444,45 @@ ctabix = Extension( define_macros=define_macros ) + + cutils = Extension( "pysam.libcutils", [source_pattern % "utils", "pysam/pysam_util.c"] + + htslib_sources + + os_c_files, + library_dirs=["pysam"] + htslib_library_dirs, + include_dirs=["pysam", "."] + + include_os + htslib_include_dirs, + libraries=external_htslib_libraries + internal_htslib_libraries + internal_tools_libraries, + language="c", + extra_compile_args=extra_compile_args, + define_macros=define_macros +) + +csamtools = Extension( + "pysam.libcsamtools", + [source_pattern % "samtools"] + glob.glob(os.path.join("samtools", "*.pysam.c")) + - # glob.glob(os.path.join("samtools", "*", "*.pysam.c")) + + htslib_sources + + os_c_files, + library_dirs=["pysam"] + htslib_library_dirs, + include_dirs=["samtools", "pysam", "."] + + include_os + htslib_include_dirs, + libraries=external_htslib_libraries + internal_htslib_libraries, + language="c", + extra_compile_args=extra_compile_args, + define_macros=define_macros +) + +cbcftools = Extension( + "pysam.libcbcftools", + [source_pattern % "bcftools"] + glob.glob(os.path.join("bcftools", "*.pysam.c")) + - # glob.glob(os.path.join("bcftools", "*", "*.pysam.c")) + htslib_sources + os_c_files, library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["samtools", "bcftools", "pysam", "."] + + include_dirs=["bcftools", "pysam", "."] + include_os + htslib_include_dirs, libraries=external_htslib_libraries + internal_htslib_libraries, language="c", @@ -538,6 +575,8 @@ metadata = { cbcf, cbgzf, cfaidx, + csamtools, + cbcftools, cutils], 'cmdclass': cmdclass, 'package_dir': package_dirs, diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index b0a3466..6d9101c 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -234,20 +234,46 @@ class TestAlignedSegment(ReadTest): def test_infer_query_length(self): '''Test infer_query_length on M|=|X|I|D|H|S cigar ops''' a = self.buildRead() - a.cigarstring = '15M' - self.assertEqual(a.infer_query_length(), 15) - a.cigarstring = '15=' - self.assertEqual(a.infer_query_length(), 15) - a.cigarstring = '15X' - self.assertEqual(a.infer_query_length(), 15) - a.cigarstring = '5M5I5M' - self.assertEqual(a.infer_query_length(), 15) - a.cigarstring = '5M5D5M' - self.assertEqual(a.infer_query_length(), 10) - a.cigarstring = '5H10M' - self.assertEqual(a.infer_query_length(), 15) - a.cigarstring = '5S10M' - self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '40M' + self.assertEqual(a.infer_query_length(), 40) + a.cigarstring = '40=' + self.assertEqual(a.infer_query_length(), 40) + a.cigarstring = '40X' + self.assertEqual(a.infer_query_length(), 40) + a.cigarstring = '20M5I20M' + self.assertEqual(a.infer_query_length(), 45) + a.cigarstring = '20M5D20M' + self.assertEqual(a.infer_query_length(), 40) + a.cigarstring = '5H35M' + self.assertEqual(a.infer_query_length(), 35) + a.cigarstring = '5S35M' + self.assertEqual(a.infer_query_length(), 40) + a.cigarstring = '35M5H' + self.assertEqual(a.infer_query_length(), 35) + a.cigarstring = '35M5S' + self.assertEqual(a.infer_query_length(), 40) + + def test_infer_read_length(self): + '''Test infer_read_length on M|=|X|I|D|H|S cigar ops''' + a = self.buildRead() + a.cigarstring = '40M' + self.assertEqual(a.infer_read_length(), 40) + a.cigarstring = '40=' + self.assertEqual(a.infer_read_length(), 40) + a.cigarstring = '40X' + self.assertEqual(a.infer_read_length(), 40) + a.cigarstring = '20M5I20M' + self.assertEqual(a.infer_read_length(), 45) + a.cigarstring = '20M5D20M' + self.assertEqual(a.infer_read_length(), 40) + a.cigarstring = '5H35M' + self.assertEqual(a.infer_read_length(), 40) + a.cigarstring = '5S35M' + self.assertEqual(a.infer_read_length(), 40) + a.cigarstring = '35M5H' + self.assertEqual(a.infer_read_length(), 40) + a.cigarstring = '35M5S' + self.assertEqual(a.infer_read_length(), 40) def test_get_aligned_pairs_soft_clipping(self): a = self.buildRead() @@ -388,22 +414,28 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.query_alignment_length, 20) a.cigarstring = "20M1S" self.assertEqual(a.query_alignment_length, 20) + a.cigarstring = "20M1H" + self.assertEqual(a.query_alignment_length, 20) a.cigarstring = "1S20M" self.assertEqual(a.query_alignment_length, 20) + a.cigarstring = "1H20M" + self.assertEqual(a.query_alignment_length, 20) a.cigarstring = "1S20M1S" self.assertEqual(a.query_alignment_length, 20) + a.cigarstring = "1H20M1H" + self.assertEqual(a.query_alignment_length, 20) def test_query_length_is_limited(self): a = self.buildRead() a.query_name = "A" * 1 - a.query_name = "A" * 254 + a.query_name = "A" * 251 self.assertRaises( ValueError, setattr, a, "query_name", - "A" * 255) + "A" * 252) class TestCigarStats(ReadTest): @@ -785,5 +817,34 @@ class TestAsString(unittest.TestCase): self.assertEqual(s, p.tostring(pysamf)) +class TestEnums(unittest.TestCase): + + def test_cigar_enums_are_defined(self): + self.assertEqual(pysam.CMATCH, 0) + self.assertEqual(pysam.CINS, 1) + self.assertEqual(pysam.CDEL, 2) + self.assertEqual(pysam.CREF_SKIP, 3) + self.assertEqual(pysam.CSOFT_CLIP, 4) + self.assertEqual(pysam.CHARD_CLIP, 5) + self.assertEqual(pysam.CPAD, 6) + self.assertEqual(pysam.CEQUAL, 7) + self.assertEqual(pysam.CDIFF, 8) + self.assertEqual(pysam.CBACK, 9) + + def test_sam_flags_are_defined(self): + self.assertEqual(pysam.FPAIRED, 1) + self.assertEqual(pysam.FPROPER_PAIR, 2) + self.assertEqual(pysam.FUNMAP, 4) + self.assertEqual(pysam.FMUNMAP, 8) + self.assertEqual(pysam.FREVERSE, 16) + self.assertEqual(pysam.FMREVERSE, 32) + self.assertEqual(pysam.FREAD1, 64) + self.assertEqual(pysam.FREAD2, 128) + self.assertEqual(pysam.FSECONDARY, 256) + self.assertEqual(pysam.FQCFAIL, 512) + self.assertEqual(pysam.FDUP, 1024) + self.assertEqual(pysam.FSUPPLEMENTARY, 2048) + + if __name__ == "__main__": unittest.main() diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 18fb05b..a866881 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -439,10 +439,12 @@ class TestIO(unittest.TestCase): input_filename, reference_filename, output_filename, - input_mode, output_mode, + input_mode, + output_mode, sequence_filename=None, use_template=True, - checkf=checkBinaryEqual): + checkf=checkBinaryEqual, + **kwargs): '''iterate through *input_filename* writing to *output_filename* and comparing the output to *reference_filename*. @@ -477,7 +479,7 @@ class TestIO(unittest.TestCase): output_filename, output_mode, reference_filename=sequence_filename, - template=infile) + template=infile, **kwargs) else: outfile = pysam.AlignmentFile( output_filename, @@ -485,7 +487,8 @@ class TestIO(unittest.TestCase): reference_names=infile.references, reference_lengths=infile.lengths, reference_filename=sequence_filename, - add_sq_text=False) + add_sq_text=False, + **kwargs) iter = infile.fetch() @@ -509,6 +512,13 @@ class TestIO(unittest.TestCase): "tmp_ex2.sam", "r", "wh") + def testSAM2SAMWithoutHeader(self): + self.checkEcho("ex2.sam", + "ex1.sam", + "tmp_ex2.sam", + "r", "w", + add_sam_header=False) + def testBAM2BAM(self): self.checkEcho("ex2.bam", "ex2.bam", @@ -588,14 +598,6 @@ class TestIO(unittest.TestCase): # self.checkEcho(input_filename, reference_filename, output_filename, # "rb", "wb", use_template=False) - # Release 0.8.0 - # no samfiles without header - def testSAM2SAMWithoutHeader(self): - self.checkEcho("ex2.sam", - "ex1.sam", - "tmp_ex2.sam", - "r", "w") - def testReadSamWithoutTargetNames(self): '''see issue 104.''' input_filename = os.path.join( @@ -614,14 +616,12 @@ class TestIO(unittest.TestCase): input_filename, "r", check_header=True) - infile = pysam.AlignmentFile( + with pysam.AlignmentFile( input_filename, check_header=False, - check_sq=False) - - # TODO - # result = list(infile.fetch(until_eof=True)) - # self.assertEqual(2, len(result)) + check_sq=False) as infile: + result = list(infile.fetch(until_eof=True)) + self.assertEqual(2, len(result)) def testReadBamWithoutTargetNames(self): '''see issue 104.''' @@ -641,52 +641,43 @@ class TestIO(unittest.TestCase): "r", check_header=True) - infile = pysam.AlignmentFile( - input_filename, check_header=False, check_sq=False) - result = list(infile.fetch(until_eof=True)) + with pysam.AlignmentFile( + input_filename, check_sq=False) as infile: + result = list(infile.fetch(until_eof=True)) - # TODO - def testReadSamWithoutHeader(self): + def test_fail_read_sam_without_header(self): input_filename = os.path.join(DATADIR, "ex1.sam") - # reading from a samfile without header is not - # implemented self.assertRaises(ValueError, pysam.AlignmentFile, input_filename, "r") - # TODO - # without check_header header is no read - # leading to segfault - # self.assertRaises(ValueError, - # pysam.AlignmentFile, - # input_filename, - # "r", - # check_header=False) + def test_pass_read_sam_without_header_with_refs(self): + with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.sam"), + "r", + reference_names=["chr1", "chr2"], + reference_lengths=[1575, 1584]) as samfile: + self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270) - # TODO - # def testReadUnformattedFile(self): - # '''test reading from a file that is not bam/sam formatted''' - # input_filename = os.path.join(DATADIR, 'Makefile') - - # # bam - file raise error - # self.assertRaises(ValueError, - # pysam.AlignmentFile, - # input_filename, - # "rb") - - # # sam - file error, but can't fetch - # self.assertRaises(ValueError, - # pysam.AlignmentFile, - # input_filename, - # "r") - - # self.assertRaises(ValueError, - # pysam.AlignmentFile, - # input_filename, - # "r", - # check_header=False) + def test_pass_read_sam_with_header_without_header_check(self): + with pysam.AlignmentFile(os.path.join(DATADIR, "ex2.sam"), + "r", check_header=False) as samfile: + self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270) + + def test_fail_when_reading_unformatted_files(self): + '''test reading from a file that is not bam/sam formatted''' + input_filename = os.path.join(DATADIR, 'Makefile') + + self.assertRaises(ValueError, + pysam.AlignmentFile, + input_filename, + "rb") + + self.assertRaises(ValueError, + pysam.AlignmentFile, + input_filename, + "r") def testBAMWithoutAlignedSegments(self): '''see issue 117''' @@ -854,7 +845,23 @@ class TestIO(unittest.TestCase): check_sq=False) samfile.fetch('chr2') - + def test_fetch_by_tid(self): + with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), "rb") as samfile: + self.assertEqual(len(list(samfile.fetch('chr1'))), + len(list(samfile.fetch(tid=0)))) + self.assertEqual(len(list(samfile.fetch('chr2'))), + len(list(samfile.fetch(tid=1)))) + self.assertRaises( + IndexError, + samfile.fetch, + tid=2) + self.assertRaises( + IndexError, + samfile.fetch, + tid=-1) + self.assertEqual(len(list(samfile.fetch('chr1',start=1000, end=2000))), + len(list(samfile.fetch(tid=0, start=1000, end=2000)))) + class TestAutoDetect(unittest.TestCase): @@ -1761,7 +1768,7 @@ class TestDeNovoConstruction(unittest.TestCase): # os.unlink(tmpfilename) - def testBAMPerRead(self): + def test_pass_if_reads_binary_equal(self): '''check if individual reads are binary equal.''' infile = pysam.AlignmentFile(self.bamfile, "rb") @@ -1846,25 +1853,17 @@ class TestTruncatedBAM(unittest.TestCase): '''see pull request 50.''' - def testTruncatedBam(self): + def testTruncatedBam2(self): + self.assertRaises(IOError, + pysam.AlignmentFile, + os.path.join(DATADIR, 'ex2_truncated.bam')) - s = pysam.AlignmentFile( - os.path.join(DATADIR, 'ex2_truncated.bam')) + def testTruncatedBam2(self): + s = pysam.AlignmentFile(os.path.join(DATADIR, 'ex2_truncated.bam'), + ignore_truncation=True) iterall = lambda x: len([a for a in x]) self.assertRaises(IOError, iterall, s) - def testTruncatedBamFetch(self): - '''See comments for pull request at - https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 - ''' - # Currently there is no way to detect truncated - # files through hts_iter_fetch, so this test is - # disabled - return - s = pysam.AlignmentFile( - os.path.join(DATADIR, 'ex2_truncated.bam')) - iterall = lambda x: len([a for a in x]) - self.assertRaises(IOError, iterall, s.fetch()) COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204, 0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78, diff --git a/tests/SamFile_test.py b/tests/SamFile_test.py deleted file mode 100644 index ff13045..0000000 --- a/tests/SamFile_test.py +++ /dev/null @@ -1,1990 +0,0 @@ -#!/usr/bin/env python -'''unit testing code for pysam. - -Execute in the :file:`tests` directory as it requires the Makefile -and data files located there. -''' - -import pysam -import pysam.samtools -import unittest -import os -import shutil -import sys -import collections -import subprocess -import logging -import array -from TestUtils import checkBinaryEqual, checkURL, force_str - -DATADIR = "pysam_data" - - -class BasicTestBAMFetch(unittest.TestCase): - - '''basic first test - detailed testing - if information in file is consistent - with information in AlignedRead object.''' - - def setUp(self): - self.samfile = pysam.Samfile( - os.path.join(DATADIR, "ex3.bam"), - "rb") - self.reads = list(self.samfile.fetch()) - - def testARqname(self): - self.assertEqual( - self.reads[0].qname, - "read_28833_29006_6945", - "read name mismatch in read 1: %s != %s" % ( - self.reads[0].qname, "read_28833_29006_6945")) - self.assertEqual( - self.reads[1].qname, - "read_28701_28881_323b", - "read name mismatch in read 2: %s != %s" % ( - self.reads[1].qname, "read_28701_28881_323b")) - - def testARflag(self): - self.assertEqual( - self.reads[0].flag, 99, - "flag mismatch in read 1: %s != %s" % ( - self.reads[0].flag, 99)) - self.assertEqual( - self.reads[1].flag, 147, - "flag mismatch in read 2: %s != %s" % ( - self.reads[1].flag, 147)) - - def testARrname(self): - self.assertEqual( - self.reads[0].rname, 0, - "chromosome/target id mismatch in read 1: %s != %s" % - (self.reads[0].rname, 0)) - self.assertEqual( - self.reads[1].rname, 1, - "chromosome/target id mismatch in read 2: %s != %s" % - (self.reads[1].rname, 1)) - - def testARpos(self): - self.assertEqual( - self.reads[0].pos, 33 - 1, - "mapping position mismatch in read 1: %s != %s" % - (self.reads[0].pos, 33 - 1)) - self.assertEqual( - self.reads[1].pos, 88 - 1, - "mapping position mismatch in read 2: %s != %s" % - (self.reads[1].pos, 88 - 1)) - - def testARmapq(self): - self.assertEqual( - self.reads[0].mapq, 20, - "mapping quality mismatch in read 1: %s != %s" % - (self.reads[0].mapq, 20)) - self.assertEqual( - self.reads[1].mapq, 30, - "mapping quality mismatch in read 2: %s != %s" % ( - self.reads[1].mapq, 30)) - - def testARcigar(self): - self.assertEqual( - self.reads[0].cigar, - [(0, 10), (2, 1), (0, 25)], - "read name length mismatch in read 1: %s != %s" % - (self.reads[0].cigar, [(0, 10), (2, 1), (0, 25)])) - self.assertEqual( - self.reads[1].cigar, [(0, 35)], - "read name length mismatch in read 2: %s != %s" % - (self.reads[1].cigar, [(0, 35)])) - - def testARcigarstring(self): - self.assertEqual(self.reads[0].cigarstring, '10M1D25M') - self.assertEqual(self.reads[1].cigarstring, '35M') - - def testARmrnm(self): - self.assertEqual( - self.reads[0].mrnm, 0, - "mate reference sequence name mismatch in read 1: %s != %s" % - (self.reads[0].mrnm, 0)) - self.assertEqual( - self.reads[1].mrnm, 1, - "mate reference sequence name mismatch in read 2: %s != %s" % - (self.reads[1].mrnm, 1)) - self.assertEqual( - self.reads[0].rnext, 0, - "mate reference sequence name mismatch in read 1: %s != %s" % - (self.reads[0].rnext, 0)) - self.assertEqual( - self.reads[1].rnext, 1, - "mate reference sequence name mismatch in read 2: %s != %s" % - (self.reads[1].rnext, 1)) - - def testARmpos(self): - self.assertEqual(self.reads[ - 0].mpos, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].mpos, 200 - 1)) - self.assertEqual(self.reads[ - 1].mpos, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].mpos, 500 - 1)) - self.assertEqual(self.reads[ - 0].pnext, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].pnext, 200 - 1)) - self.assertEqual(self.reads[ - 1].pnext, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].pnext, 500 - 1)) - - def testARisize(self): - self.assertEqual(self.reads[0].isize, 167, "insert size mismatch in read 1: %s != %s" % ( - self.reads[0].isize, 167)) - self.assertEqual(self.reads[1].isize, 412, "insert size mismatch in read 2: %s != %s" % ( - self.reads[1].isize, 412)) - self.assertEqual(self.reads[0].tlen, 167, "insert size mismatch in read 1: %s != %s" % ( - self.reads[0].tlen, 167)) - self.assertEqual(self.reads[1].tlen, 412, "insert size mismatch in read 2: %s != %s" % ( - self.reads[1].tlen, 412)) - - def testARseq(self): - self.assertEqual(self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 1: %s != %s" % ( - self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG")) - self.assertEqual(self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "sequence size mismatch in read 2: %s != %s" % ( - self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA")) - self.assertEqual(self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 4: %s != %s" % ( - self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG")) - - def testARqual(self): - self.assertEqual(self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", - "quality string mismatch in read 1: %s != %s" % (self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")) - self.assertEqual(self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "quality string mismatch in read 2: %s != %s" % ( - self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")) - self.assertEqual(self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", - "quality string mismatch in read 3: %s != %s" % (self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")) - - def testARquery(self): - self.assertEqual(self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "query mismatch in read 1: %s != %s" % ( - self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG")) - self.assertEqual(self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "query size mismatch in read 2: %s != %s" % ( - self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA")) - self.assertEqual(self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT", "query mismatch in read 4: %s != %s" % ( - self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT")) - - def testARqqual(self): - self.assertEqual( - self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", - "qquality string mismatch in read 1: %s != %s" % - (self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")) - self.assertEqual( - self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", - "qquality string mismatch in read 2: %s != %s" % - (self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")) - self.assertEqual( - self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22", - "qquality string mismatch in read 3: %s != %s" % - (self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22")) - - def testPresentOptionalFields(self): - self.assertEqual( - self.reads[0].opt('NM'), 1, - "optional field mismatch in read 1, NM: %s != %s" % - (self.reads[0].opt('NM'), 1)) - self.assertEqual( - self.reads[0].opt('RG'), 'L1', - "optional field mismatch in read 1, RG: %s != %s" % - (self.reads[0].opt('RG'), 'L1')) - self.assertEqual( - self.reads[1].opt('RG'), 'L2', - "optional field mismatch in read 2, RG: %s != %s" % - (self.reads[1].opt('RG'), 'L2')) - self.assertEqual( - self.reads[1].opt('MF'), 18, - "optional field mismatch in read 2, MF: %s != %s" % - (self.reads[1].opt('MF'), 18)) - - def testPairedBools(self): - self.assertEqual(self.reads[0].is_paired, True, - "is paired mismatch in read 1: %s != %s" % ( - self.reads[0].is_paired, True)) - self.assertEqual(self.reads[1].is_paired, True, - "is paired mismatch in read 2: %s != %s" % ( - self.reads[1].is_paired, True)) - self.assertEqual(self.reads[0].is_proper_pair, True, - "is proper pair mismatch in read 1: %s != %s" % ( - self.reads[0].is_proper_pair, True)) - self.assertEqual(self.reads[1].is_proper_pair, True, - "is proper pair mismatch in read 2: %s != %s" % ( - self.reads[1].is_proper_pair, True)) - - def testTags(self): - self.assertEqual(self.reads[0].tags, - [('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U')]) - self.assertEqual(self.reads[1].tags, - [('MF', 18), ('RG', 'L2'), - ('PG', 'P2'), ('XT', 'R')]) - - def testAddTags(self): - self.assertEqual(sorted(self.reads[0].tags), - sorted([('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U')])) - - self.reads[0].setTag('X1', 'C') - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 'C'), ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ])) - self.reads[0].setTag('X2', 5) - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X2', 5), ('X1', 'C'), - ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ])) - # add with replacement - self.reads[0].setTag('X2', 10) - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X2', 10), ('X1', 'C'), - ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ])) - - # add without replacement - self.reads[0].setTag('X2', 5, replace=False) - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X2', 10), ('X1', 'C'), - ('X2', 5), - ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ])) - - def testAddTagsType(self): - self.reads[0].tags = None - self.assertEqual(self.reads[0].tags, []) - - self.reads[0].setTag('X1', 5.0) - self.reads[0].setTag('X2', "5.0") - self.reads[0].setTag('X3', 5) - - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 5.0), - ('X2', "5.0"), - ('X3', 5)])) - - # test setting float for int value - self.reads[0].setTag('X4', 5, value_type='d') - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 5.0), - ('X2', "5.0"), - ('X3', 5), - ('X4', 5.0)])) - - # test setting int for float value - the - # value will be rounded. - self.reads[0].setTag('X5', 5.2, value_type='i') - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 5.0), - ('X2', "5.0"), - ('X3', 5), - ('X4', 5.0), - ('X5', 5)])) - - # test setting invalid type code - self.assertRaises(ValueError, self.reads[0].setTag, 'X6', 5.2, 'g') - - def testTagsUpdatingFloat(self): - self.assertEqual(self.reads[0].tags, - [('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U')]) - self.reads[0].tags += [('XC', 5.0)] - self.assertEqual(self.reads[0].tags, - [('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)]) - - def testOpt(self): - self.assertEqual(self.reads[0].opt("XT"), "U") - self.assertEqual(self.reads[1].opt("XT"), "R") - - def testMissingOpt(self): - self.assertRaises(KeyError, self.reads[0].opt, "XP") - - def testEmptyOpt(self): - self.assertRaises(KeyError, self.reads[2].opt, "XT") - - def tearDown(self): - self.samfile.close() - - -class BasicTestBAMFile(BasicTestBAMFetch): - - def setUp(self): - self.samfile = pysam.Samfile( - os.path.join(DATADIR, "ex3.sam"), - "r") - self.reads = [r for r in self.samfile] - - -class BasicTestSAMFile(BasicTestBAMFetch): - - def setUp(self): - self.samfile = pysam.Samfile( - os.path.join(DATADIR, "ex3.sam"), - "r") - self.reads = [r for r in self.samfile] - - -class BasicTestSAMFetch(BasicTestBAMFetch): - - def setUp(self): - self.samfile = pysam.Samfile( - os.path.join(DATADIR, "ex3.sam"), - "r") - self.reads = list(self.samfile.fetch()) - - -# needs to be implemented -# class TestAlignedReadFromSamWithoutHeader(TestAlignedReadFromBam): -# -# def setUp(self): -# self.samfile=pysam.Samfile( "ex7.sam","r" ) -# self.reads=list(self.samfile.fetch()) - - -class TestIO(unittest.TestCase): - - '''check if reading samfile and writing a samfile are consistent.''' - - def checkEcho(self, - input_filename, - reference_filename, - output_filename, - input_mode, output_mode, - use_template=True): - '''iterate through *input_filename* writing to *output_filename* and - comparing the output to *reference_filename*. - - The files are opened according to the *input_mode* and *output_mode*. - - If *use_template* is set, the header is copied from infile - using the template mechanism, otherwise target names and - lengths are passed explicitly. - - ''' - - infile = pysam.Samfile(os.path.join(DATADIR, input_filename), - input_mode) - if use_template: - outfile = pysam.Samfile(output_filename, - output_mode, - template=infile) - else: - outfile = pysam.Samfile(output_filename, - output_mode, - referencenames=infile.references, - referencelengths=infile.lengths, - add_sq_text=False) - - iter = infile.fetch() - - for x in iter: - outfile.write(x) - infile.close() - outfile.close() - - self.assertTrue( - checkBinaryEqual(os.path.join(DATADIR, reference_filename), - output_filename), - "files %s and %s are not the same" % (reference_filename, - output_filename)) - - def testReadWriteBam(self): - - input_filename = "ex1.bam" - output_filename = "pysam_ex1.bam" - reference_filename = "ex1.bam" - - self.checkEcho(input_filename, reference_filename, output_filename, - "rb", "wb", use_template=True) - - # Disabled - should work, files are not binary equal, but are - # non-binary equal: - # diff <(samtools view pysam_ex1.bam) <(samtools view pysam_data/ex1.bam) - # def testReadWriteBamWithTargetNames(self): - # input_filename = "ex1.bam" - # output_filename = "pysam_ex1.bam" - # reference_filename = "ex1.bam" - - # self.checkEcho(input_filename, reference_filename, output_filename, - # "rb", "wb", use_template=False) - - def testReadWriteSamWithHeader(self): - - input_filename = "ex2.sam" - output_filename = "pysam_ex2.sam" - reference_filename = "ex2.sam" - - self.checkEcho(input_filename, - reference_filename, - output_filename, - "r", "wh") - - # Release 0.8.0 - # no samfiles without header - def testReadWriteSamWithoutHeader(self): - - input_filename = "ex2.sam" - output_filename = "pysam_ex2.sam" - reference_filename = "ex1.sam" - - self.checkEcho(input_filename, - reference_filename, - output_filename, - "r", "w") - - def testReadSamWithoutTargetNames(self): - '''see issue 104.''' - input_filename = os.path.join(DATADIR, - "example_unmapped_reads_no_sq.sam") - - # raise exception in default mode - self.assertRaises(ValueError, pysam.Samfile, input_filename, "r") - - # raise exception if no SQ files - self.assertRaises(ValueError, pysam.Samfile, - input_filename, "r", - check_header=True) - - infile = pysam.Samfile( - input_filename, - check_header=False, - check_sq=False) - - # TODO - # result = list(infile.fetch(until_eof=True)) - # self.assertEqual(2, len(result)) - - def testReadBamWithoutTargetNames(self): - '''see issue 104.''' - input_filename = os.path.join( - DATADIR, "example_unmapped_reads_no_sq.bam") - - # raise exception in default mode - self.assertRaises(ValueError, pysam.Samfile, input_filename, "r") - - # raise exception if no SQ files - self.assertRaises(ValueError, pysam.Samfile, input_filename, "r", - check_header=True) - - infile = pysam.Samfile( - input_filename, check_header=False, check_sq=False) - result = list(infile.fetch(until_eof=True)) - - # TODO - def testReadSamWithoutHeader(self): - input_filename = os.path.join(DATADIR, "ex1.sam") - - # reading from a samfile without header is not - # implemented - self.assertRaises(ValueError, - pysam.Samfile, - input_filename, - "r") - - # TODO - # without check_header header is no read - # leading to segfault - # self.assertRaises(ValueError, - # pysam.Samfile, - # input_filename, - # "r", - # check_header=False) - - # TODO - # def testReadUnformattedFile(self): - # '''test reading from a file that is not bam/sam formatted''' - # input_filename = os.path.join(DATADIR, 'Makefile') - - # # bam - file raise error - # self.assertRaises(ValueError, - # pysam.Samfile, - # input_filename, - # "rb") - - # # sam - file error, but can't fetch - # self.assertRaises(ValueError, - # pysam.Samfile, - # input_filename, - # "r") - - # self.assertRaises(ValueError, - # pysam.Samfile, - # input_filename, - # "r", - # check_header=False) - - def testBAMWithoutAlignedReads(self): - '''see issue 117''' - input_filename = os.path.join(DATADIR, "test_unaligned.bam") - samfile = pysam.Samfile(input_filename, "rb", check_sq=False) - samfile.fetch(until_eof=True) - - def testBAMWithShortBAI(self): - '''see issue 116''' - input_filename = os.path.join(DATADIR, "example_bai.bam") - samfile = pysam.Samfile(input_filename, "rb", check_sq=False) - samfile.fetch('chr2') - - def testFetchFromClosedFile(self): - - samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - samfile.close() - self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120) - - def testClosedFile(self): - '''test that access to a closed samfile raises ValueError.''' - - samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - samfile.close() - self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120) - self.assertRaises(ValueError, samfile.pileup, 'chr1', 100, 120) - self.assertRaises(ValueError, samfile.getrname, 0) - # TODO - self.assertRaises(ValueError, samfile.tell) - self.assertRaises(ValueError, samfile.seek, 0) - self.assertRaises(ValueError, getattr, samfile, "nreferences") - self.assertRaises(ValueError, getattr, samfile, "references") - self.assertRaises(ValueError, getattr, samfile, "lengths") - self.assertRaises(ValueError, getattr, samfile, "text") - self.assertRaises(ValueError, getattr, samfile, "header") - - # write on closed file - self.assertEqual(0, samfile.write(None)) - - def testAutoDetection(self): - '''test if autodetection works.''' - - # TODO - # samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.sam")) - # self.assertRaises(ValueError, samfile.fetch, 'chr1') - # samfile.close() - - samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.bam")) - samfile.fetch('chr1') - samfile.close() - - # TOOD - # def testReadingFromSamFileWithoutHeader(self): - # '''read from samfile without header. - # ''' - # samfile = pysam.Samfile(os.path.join(DATADIR, "ex7.sam"), - # check_header=False, - # check_sq=False) - # self.assertRaises(NotImplementedError, samfile.__iter__) - - def testReadingFromFileWithoutIndex(self): - '''read from bam file without index.''' - - shutil.copyfile(os.path.join(DATADIR, "ex2.bam"), 'tmp_ex2.bam') - samfile = pysam.Samfile('tmp_ex2.bam', - "rb") - self.assertRaises(ValueError, samfile.fetch) - self.assertEqual(len(list(samfile.fetch(until_eof=True))), - 3270) - os.unlink('tmp_ex2.bam') - - # def testReadingUniversalFileMode(self): - # '''read from samfile without header. - # ''' - - # input_filename = "ex2.sam" - # output_filename = "pysam_ex2.sam" - # reference_filename = "ex1.sam" - - # self.checkEcho(input_filename, - # reference_filename, - # output_filename, - # "rU", "w") - - def testHead(self): - '''test IteratorRowHead''' - samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - l10 = list(samfile.head(10)) - l100 = list(samfile.head(100)) - self.assertEqual(len(l10), 10) - self.assertEqual(len(l100), 100) - self.assertEqual(list(map(str, l10)), - list(map(str, l100[:10]))) - - -class TestFloatTagBug(unittest.TestCase): - - '''see issue 71''' - - def testFloatTagBug(self): - '''a float tag before another exposed a parsing bug in bam_aux_get. - - Fixed in 0.1.19 - ''' - samfile = pysam.Samfile(os.path.join(DATADIR, "tag_bug.bam")) - read = next(samfile.fetch(until_eof=True)) - self.assertTrue(('XC', 1) in read.tags) - self.assertEqual(read.opt('XC'), 1) - - -class TestLargeFieldBug(unittest.TestCase): - - '''see issue 100''' - - def testLargeFileBug(self): - '''when creating a read with a large entry in the tag field - causes an errror: - NotImplementedError: tags field too large - ''' - samfile = pysam.Samfile(os.path.join(DATADIR, "issue100.bam")) - read = next(samfile.fetch(until_eof=True)) - new_read = pysam.AlignedRead() - new_read.tags = read.tags - self.assertEqual(new_read.tags, read.tags) - - -class TestTagParsing(unittest.TestCase): - - '''tests checking the accuracy of tag setting and retrieval.''' - - def makeRead(self): - a = pysam.AlignedRead() - a.qname = "read_12345" - a.tid = 0 - a.seq = "ACGT" * 3 - a.flag = 0 - a.rname = 0 - a.pos = 1 - a.mapq = 20 - a.cigar = ((0, 10), (2, 1), (0, 25)) - a.mrnm = 0 - a.mpos = 200 - a.isize = 0 - a.qual = "1234" * 3 - # todo: create tags - return a - - def testNegativeIntegers(self): - x = -2 - aligned_read = self.makeRead() - aligned_read.tags = [("XD", int(x))] - # print (aligned_read.tags) - - def testNegativeIntegers2(self): - x = -2 - r = self.makeRead() - r.tags = [("XD", int(x))] - outfile = pysam.Samfile("test.bam", - "wb", - referencenames=("chr1",), - referencelengths = (1000,)) - outfile.write(r) - outfile.close() - - def testCigarString(self): - r = self.makeRead() - self.assertEqual(r.cigarstring, "10M1D25M") - r.cigarstring = "20M10D20M" - self.assertEqual(r.cigar, [(0, 20), (2, 10), (0, 20)]) - # unsetting cigar string - r.cigarstring = None - self.assertEqual(r.cigarstring, None) - - def testCigar(self): - r = self.makeRead() - self.assertEqual(r.cigar, [(0, 10), (2, 1), (0, 25)]) - # unsetting cigar string - r.cigar = None - self.assertEqual(r.cigar, []) - - def testLongTags(self): - '''see issue 115''' - - r = self.makeRead() - rg = 'HS2000-899_199.L3' - tags = [('XC', 85), ('XT', 'M'), ('NM', 5), - ('SM', 29), ('AM', 29), ('XM', 1), - ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'), - ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')] - - r.tags = tags - r.tags += [("RG", rg)] * 100 - tags += [("RG", rg)] * 100 - - self.assertEqual(tags, r.tags) - - -class TestClipping(unittest.TestCase): - - def testClipping(self): - - self.samfile = pysam.Samfile(os.path.join(DATADIR, "softclip.bam"), - "rb") - for read in self.samfile: - - if read.qname == "r001": - self.assertEqual(read.seq, 'AAAAGATAAGGATA') - self.assertEqual(read.query, 'AGATAAGGATA') - self.assertEqual(read.qual, None) - self.assertEqual(read.qqual, None) - - elif read.qname == "r002": - - self.assertEqual(read.seq, 'GCCTAAGCTAA') - self.assertEqual(read.query, 'AGCTAA') - self.assertEqual(read.qual, '01234567890') - self.assertEqual(read.qqual, '567890') - - elif read.qname == "r003": - - self.assertEqual(read.seq, 'GCCTAAGCTAA') - self.assertEqual(read.query, 'GCCTAA') - self.assertEqual(read.qual, '01234567890') - self.assertEqual(read.qqual, '012345') - - elif read.qname == "r004": - - self.assertEqual(read.seq, 'TAGGC') - self.assertEqual(read.query, 'TAGGC') - self.assertEqual(read.qual, '01234') - self.assertEqual(read.qqual, '01234') - - -class TestIteratorRow(unittest.TestCase): - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - def checkRange(self, rnge): - '''compare results from iterator with those from samtools.''' - ps = list(self.samfile.fetch(region=rnge)) - sa = force_str( - pysam.samtools.view( - os.path.join(DATADIR, "ex1.bam"), - rnge, - raw=True)).splitlines(True) - self.assertEqual( - len(ps), len(sa), - "unequal number of results for range %s: %i != %i" % - (rnge, len(ps), len(sa))) - # check if the same reads are returned and in the same order - for line, (a, b) in enumerate(list(zip(ps, sa))): - d = b.split("\t") - self.assertEqual( - a.qname, d[0], - "line %i: read id mismatch: %s != %s" % - (line, a.rname, d[0])) - self.assertEqual( - a.pos, int(d[3]) - 1, - "line %i: read position mismatch: %s != %s, " - "\n%s\n%s\n" % - (line, a.pos, int(d[3]) - 1, - str(a), str(d))) - qual = d[10] - self.assertEqual( - a.qual, qual, - "line %i: quality mismatch: %s != %s, \n%s\n%s\n" % - (line, a.qual, qual, - str(a), str(d))) - - def testIteratePerContig(self): - '''check random access per contig''' - for contig in self.samfile.references: - self.checkRange(contig) - - def testIterateRanges(self): - '''check random access per range''' - for contig, length in zip(self.samfile.references, self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange("%s:%i-%i" % (contig, start, start + 90)) - - def tearDown(self): - self.samfile.close() - - -class TestIteratorRowAll(unittest.TestCase): - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - def testIterate(self): - '''compare results from iterator with those from samtools.''' - ps = list(self.samfile.fetch()) - sa = force_str( - pysam.samtools.view( - os.path.join(DATADIR, "ex1.bam"), - raw=True)).splitlines(True) - - self.assertEqual( - len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa))) - # check if the same reads are returned - for line, pair in enumerate(list(zip(ps, sa))): - data = pair[1].split("\t") - self.assertEqual(pair[0].qname, data[ - 0], "read id mismatch in line %i: %s != %s" % (line, pair[0].rname, data[0])) - - def tearDown(self): - self.samfile.close() - - -class TestIteratorColumn(unittest.TestCase): - - '''test iterator column against contents of ex4.bam.''' - - # note that samfile contains 1-based coordinates - # 1D means deletion with respect to reference sequence - # - mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35), - 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35), - } - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex4.bam"), - "rb") - - def checkRange(self, contig, start=None, end=None, truncate=False): - '''compare results from iterator with those from samtools.''' - # check if the same reads are returned and in the same order - for column in self.samfile.pileup(contig, start, end, - truncate=truncate): - if truncate: - self.assertGreaterEqual(column.pos, start) - self.assertLess(column.pos, end) - thiscov = len(column.pileups) - refcov = self.mCoverages[ - self.samfile.getrname(column.tid)][column.pos] - self.assertEqual( - thiscov, refcov, "wrong coverage at pos %s:%i %i should be %i" % ( - self.samfile.getrname(column.tid), column.pos, thiscov, refcov)) - - def testIterateAll(self): - '''check random access per contig''' - self.checkRange(None) - - def testIteratePerContig(self): - '''check random access per contig''' - for contig in self.samfile.references: - self.checkRange(contig) - - def testIterateRanges(self): - '''check random access per range''' - for contig, length in zip( - self.samfile.references, self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange(contig, start, start + 90) - - def testInverse(self): - '''test the inverse, is point-wise pileup accurate.''' - for contig, refseq in list(self.mCoverages.items()): - refcolumns = sum(refseq) - for pos, refcov in enumerate(refseq): - columns = list(self.samfile.pileup(contig, pos, pos + 1)) - if refcov == 0: - # if no read, no coverage - self.assertEqual( - len(columns), - refcov, - "wrong number of pileup columns returned for position %s:%i, %i should be %i" % ( - contig, pos, - len(columns), refcov)) - elif refcov == 1: - # one read, all columns of the read are returned - self.assertEqual( - len(columns), - refcolumns, - "pileup incomplete at position %i: got %i, expected %i " % - (pos, len(columns), refcolumns)) - - def testIterateTruncate(self): - '''check random access per range''' - for contig, length in zip(self.samfile.references, self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange(contig, start, start + 90, truncate=True) - - def tearDown(self): - self.samfile.close() - - -class TestIteratorColumn2(unittest.TestCase): - - '''test iterator column against contents of ex1.bam.''' - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - def testStart(self): - # print self.samfile.fetch().next().pos - # print self.samfile.pileup().next().pos - pass - - def testTruncate(self): - '''see issue 107.''' - # note that ranges in regions start from 1 - p = self.samfile.pileup(region='chr1:170:172', truncate=True) - columns = [x.pos for x in p] - self.assertEqual(len(columns), 3) - self.assertEqual(columns, [169, 170, 171]) - - p = self.samfile.pileup('chr1', 169, 172, truncate=True) - columns = [x.pos for x in p] - - self.assertEqual(len(columns), 3) - self.assertEqual(columns, [169, 170, 171]) - - def testAccessOnClosedIterator(self): - '''see issue 131 - - Accessing pileup data after iterator has closed. - ''' - pcolumn = self.samfile.pileup('chr1', 170, 180).__next__() - self.assertRaises(ValueError, getattr, pcolumn, "pileups") - - -class TestHeaderSam(unittest.TestCase): - - header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'}, - {'LN': 1584, 'SN': 'chr2', 'AH': '*'}], - 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"}, - {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}], - 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}], - 'HD': {'VN': '1.0'}, - 'CO': ['this is a comment', 'this is another comment'], - } - - def compareHeaders(self, a, b): - '''compare two headers a and b.''' - for ak, av in a.items(): - self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) - self.assertEqual(av, b[ak]) - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.sam"), - "r") - - def testHeaders(self): - self.compareHeaders(self.header, self.samfile.header) - self.compareHeaders(self.samfile.header, self.header) - - def testNameMapping(self): - for x, y in enumerate(("chr1", "chr2")): - tid = self.samfile.gettid(y) - ref = self.samfile.getrname(x) - self.assertEqual(tid, x) - self.assertEqual(ref, y) - - self.assertEqual(self.samfile.gettid("chr?"), -1) - self.assertRaises(ValueError, self.samfile.getrname, 2) - - def tearDown(self): - self.samfile.close() - - -class TestHeaderBam(TestHeaderSam): - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.bam"), - "rb") - - -class TestHeaderFromRefs(unittest.TestCase): - - '''see issue 144 - - reference names need to be converted to string for python 3 - ''' - - # def testHeader( self ): - # refs = ['chr1', 'chr2'] - # tmpfile = "tmp_%i" % id(self) - # s = pysam.Samfile(tmpfile, 'wb', - # referencenames=refs, - # referencelengths=[100]*len(refs)) - # s.close() - - # self.assertTrue( checkBinaryEqual( 'issue144.bam', tmpfile ), - # 'bam files differ') - # os.unlink( tmpfile ) - - -class TestHeader1000Genomes(unittest.TestCase): - - '''see issue 110''' - # bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase2b_alignment/data/NA07048/exome_alignment/NA07048.unmapped.ILLUMINA.bwa.CEU.exome.20120522_p2b.bam" - bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam" - - def testRead(self): - - if not checkURL(self.bamfile): - return - - f = pysam.Samfile(self.bamfile, "rb") - data = f.header.copy() - self.assertTrue(data) - - -class TestUnmappedReads(unittest.TestCase): - - # TODO - # def testSAM(self): - # samfile = pysam.Samfile(os.path.join(DATADIR, "ex5.sam"), - # "r") - # self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2) - # samfile.close() - - def testBAM(self): - samfile = pysam.Samfile(os.path.join(DATADIR, "ex5.bam"), - "rb") - self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2) - samfile.close() - - -class TestPileupObjects(unittest.TestCase): - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - def testPileupColumn(self): - for pcolumn1 in self.samfile.pileup(region="chr1:105"): - if pcolumn1.pos == 104: - self.assertEqual( - pcolumn1.tid, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.tid, 0)) - self.assertEqual( - pcolumn1.pos, 105 - 1, "position mismatch in position 1: %s != %s" % (pcolumn1.pos, 105 - 1)) - self.assertEqual( - pcolumn1.n, 2, "# reads mismatch in position 1: %s != %s" % (pcolumn1.n, 2)) - for pcolumn2 in self.samfile.pileup(region="chr2:1480"): - if pcolumn2.pos == 1479: - self.assertEqual( - pcolumn2.tid, 1, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn2.tid, 1)) - self.assertEqual( - pcolumn2.pos, 1480 - 1, "position mismatch in position 1: %s != %s" % (pcolumn2.pos, 1480 - 1)) - self.assertEqual( - pcolumn2.n, 12, "# reads mismatch in position 1: %s != %s" % (pcolumn2.n, 12)) - - def testPileupRead(self): - for pcolumn1 in self.samfile.pileup(region="chr1:105"): - if pcolumn1.pos == 104: - self.assertEqual( - len(pcolumn1.pileups), 2, - "# reads aligned to column mismatch in position 1" - ": %s != %s" % - (len(pcolumn1.pileups), 2)) - - -# self.assertEqual( pcolumn1.pileups[0] # need to test additional -# properties here - - def tearDown(self): - self.samfile.close() - - def testIteratorOutOfScope(self): - '''test if exception is raised if pileup col is accessed after - iterator is exhausted.''' - - for pileupcol in self.samfile.pileup(): - pass - - self.assertRaises(ValueError, getattr, pileupcol, "pileups") - - -class TestContextManager(unittest.TestCase): - - def testManager(self): - with pysam.Samfile(os.path.join(DATADIR, 'ex1.bam'), - 'rb') as samfile: - samfile.fetch() - self.assertEqual(samfile.closed, True) - - -class TestExceptions(unittest.TestCase): - - def setUp(self): - self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - def testMissingFile(self): - - self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.bam", "rb") - self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.sam", "r") - self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.bam", "r") - self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.sam", "rb") - - def testBadContig(self): - self.assertRaises(ValueError, self.samfile.fetch, "chr88") - - def testMeaninglessCrap(self): - self.assertRaises(ValueError, self.samfile.fetch, "skljf") - - def testBackwardsOrderNewFormat(self): - self.assertRaises(ValueError, self.samfile.fetch, 'chr1', 100, 10) - - def testBackwardsOrderOldFormat(self): - self.assertRaises(ValueError, self.samfile.fetch, region="chr1:100-10") - - def testOutOfRangeNegativeNewFormat(self): - self.assertRaises(ValueError, self.samfile.fetch, "chr1", 5, -10) - self.assertRaises(ValueError, self.samfile.fetch, "chr1", 5, 0) - self.assertRaises(ValueError, self.samfile.fetch, "chr1", -5, -10) - - self.assertRaises(ValueError, self.samfile.count, "chr1", 5, -10) - self.assertRaises(ValueError, self.samfile.count, "chr1", 5, 0) - self.assertRaises(ValueError, self.samfile.count, "chr1", -5, -10) - - def testOutOfRangeNegativeOldFormat(self): - self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5-10") - self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5-0") - self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5--10") - - self.assertRaises(ValueError, self.samfile.count, region="chr1:-5-10") - self.assertRaises(ValueError, self.samfile.count, region="chr1:-5-0") - self.assertRaises(ValueError, self.samfile.count, region="chr1:-5--10") - - def testOutOfRangNewFormat(self): - self.assertRaises( - ValueError, self.samfile.fetch, "chr1", 9999999999, 99999999999) - self.assertRaises( - ValueError, self.samfile.count, "chr1", 9999999999, 99999999999) - - def testOutOfRangeLargeNewFormat(self): - self.assertRaises(ValueError, self.samfile.fetch, "chr1", - 9999999999999999999999999999999, 9999999999999999999999999999999999999999) - self.assertRaises(ValueError, self.samfile.count, "chr1", - 9999999999999999999999999999999, 9999999999999999999999999999999999999999) - - def testOutOfRangeLargeOldFormat(self): - self.assertRaises( - ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999") - self.assertRaises( - ValueError, self.samfile.count, "chr1:99999999999999999-999999999999999999") - - def testZeroToZero(self): - '''see issue 44''' - self.assertEqual(len(list(self.samfile.fetch('chr1', 0, 0))), 0) - - def tearDown(self): - self.samfile.close() - - -class TestWrongFormat(unittest.TestCase): - - '''test cases for opening files not in bam/sam format.''' - - def testOpenSamAsBam(self): - self.assertRaises(ValueError, - pysam.Samfile, - os.path.join(DATADIR, 'ex1.sam'), - 'rb') - - def testOpenBamAsSam(self): - # test fails, needs to be implemented. - # sam.fetch() fails on reading, not on opening - # self.assertRaises( ValueError, pysam.Samfile, 'ex1.bam', 'r' ) - pass - - def testOpenFastaAsSam(self): - # test fails, needs to be implemented. - # sam.fetch() fails on reading, not on opening - # self.assertRaises( ValueError, pysam.Samfile, 'ex1.fa', 'r' ) - pass - - def testOpenFastaAsBam(self): - self.assertRaises(ValueError, - pysam.Samfile, - os.path.join(DATADIR, 'ex1.fa'), - 'rb') - - -class ReadTest(unittest.TestCase): - - def checkFieldEqual(self, read1, read2, exclude=[]): - '''check if two reads are equal by comparing each field.''' - - # add the . for refactoring purposes. - for x in (".qname", ".seq", ".flag", - ".rname", ".pos", ".mapq", ".cigar", - ".mrnm", ".mpos", ".isize", - ".qual", - ".bin", - ".is_paired", ".is_proper_pair", - ".is_unmapped", ".mate_is_unmapped", - ".is_reverse", ".mate_is_reverse", - ".is_read1", ".is_read2", - ".is_secondary", ".is_qcfail", - ".is_duplicate"): - n = x[1:] - if n in exclude: - continue - self.assertEqual(getattr(read1, n), getattr(read2, n), - "attribute mismatch for %s: %s != %s" % - (n, getattr(read1, n), getattr(read2, n))) - - -class TestAlignedRead(ReadTest): - - '''tests to check if aligned read can be constructed - and manipulated. - ''' - - def testEmpty(self): - a = pysam.AlignedRead() - self.assertEqual(a.qname, None) - self.assertEqual(a.seq, None) - self.assertEqual(a.qual, None) - self.assertEqual(a.flag, 0) - self.assertEqual(a.rname, -1) - self.assertEqual(a.mapq, 0) - self.assertEqual(a.cigar, []) - self.assertEqual(a.tags, []) - self.assertEqual(a.mrnm, -1) - self.assertEqual(a.mpos, -1) - self.assertEqual(a.isize, 0) - - def testStrOfEmptyRead(self): - a = pysam.AlignedRead() - s = str(a) - self.assertEqual( - "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]", - s) - - def buildRead(self): - '''build an example read.''' - - a = pysam.AlignedRead() - a.qname = "read_12345" - a.seq = "ACGT" * 10 - a.flag = 0 - a.rname = 0 - a.pos = 20 - a.mapq = 20 - a.cigar = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20)) - a.mrnm = 0 - a.mpos = 200 - a.isize = 167 - a.qual = "1234" * 10 - # todo: create tags - return a - - def testUpdate(self): - '''check if updating fields affects other variable length data - ''' - a = self.buildRead() - b = self.buildRead() - - # check qname - b.qname = "read_123" - self.checkFieldEqual(a, b, "qname") - b.qname = "read_12345678" - self.checkFieldEqual(a, b, "qname") - b.qname = "read_12345" - self.checkFieldEqual(a, b) - - # check cigar - b.cigar = ((0, 10), ) - self.checkFieldEqual(a, b, "cigar") - b.cigar = ((0, 10), (2, 1), (0, 10)) - self.checkFieldEqual(a, b, "cigar") - b.cigar = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20)) - self.checkFieldEqual(a, b) - - # check seq - b.seq = "ACGT" - self.checkFieldEqual(a, b, ("seq", "qual")) - b.seq = "ACGT" * 3 - self.checkFieldEqual(a, b, ("seq", "qual")) - b.seq = "ACGT" * 10 - self.checkFieldEqual(a, b, ("qual",)) - - # reset qual - b = self.buildRead() - - # check flags: - for x in ( - "is_paired", "is_proper_pair", - "is_unmapped", "mate_is_unmapped", - "is_reverse", "mate_is_reverse", - "is_read1", "is_read2", - "is_secondary", "is_qcfail", - "is_duplicate"): - setattr(b, x, True) - self.assertEqual(getattr(b, x), True) - self.checkFieldEqual(a, b, ("flag", x,)) - setattr(b, x, False) - self.assertEqual(getattr(b, x), False) - self.checkFieldEqual(a, b) - - def testUpdate2(self): - '''issue 135: inplace update of sequence and quality score. - - This does not work as setting the sequence will erase - the quality scores. - ''' - a = self.buildRead() - a.seq = a.seq[5:10] - self.assertEqual(a.qual, None) - - a = self.buildRead() - s = a.qual - a.seq = a.seq[5:10] - a.qual = s[5:10] - - self.assertEqual(a.qual, s[5:10]) - - def testLargeRead(self): - '''build an example read.''' - - a = pysam.AlignedRead() - a.qname = "read_12345" - a.seq = "ACGT" * 200 - a.flag = 0 - a.rname = 0 - a.pos = 20 - a.mapq = 20 - a.cigar = ((0, 4 * 200), ) - a.mrnm = 0 - a.mpos = 200 - a.isize = 167 - a.qual = "1234" * 200 - - return a - - def testTagParsing(self): - '''test for tag parsing - - see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a - ''' - samfile = pysam.Samfile(os.path.join(DATADIR, "ex8.bam"), - "rb") - - for entry in samfile: - before = entry.tags - entry.tags = entry.tags - after = entry.tags - self.assertEqual(after, before) - - def testUpdateTlen(self): - '''check if updating tlen works''' - a = self.buildRead() - oldlen = a.tlen - oldlen *= 2 - a.tlen = oldlen - self.assertEqual(a.tlen, oldlen) - - def testPositions(self): - a = self.buildRead() - self.assertEqual(a.positions, - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]) - - self.assertEqual(a.aligned_pairs, - [(0, 20), (1, 21), (2, 22), (3, 23), (4, 24), - (5, 25), (6, 26), (7, 27), (8, 28), (9, 29), - (None, 30), - (10, 31), (11, 32), (12, 33), (13, 34), (14, 35), - (15, 36), (16, 37), (17, 38), (18, 39), (19, None), - (20, 40), (21, 41), (22, 42), (23, 43), (24, 44), - (25, 45), (26, 46), (27, 47), (28, 48), (29, 49), - (30, 50), (31, 51), (32, 52), (33, 53), (34, 54), - (35, 55), (36, 56), (37, 57), (38, 58), (39, 59)]) - - self.assertEqual( - a.positions, - [x[1] for x in a.aligned_pairs - if x[0] is not None and x[1] is not None]) - # alen is the length of the aligned read in genome - self.assertEqual(a.alen, a.aligned_pairs[-1][0] + 1) - # aend points to one beyond last aligned base in ref - self.assertEqual(a.positions[-1], a.aend - 1) - - def testBlocks(self): - a = self.buildRead() - self.assertEqual(a.blocks, - [(20, 30), (31, 40), (40, 60)]) - - # Disabled as not backwards compatible - # def testFancyStr(self): - # a = self.buildRead() - # output = a.fancy_str() - # self.assertEqual(len(output), 9) - - -class TestDeNovoConstruction(ReadTest): - - '''check BAM/SAM file construction using ex6.sam - - (note these are +1 coordinates): - - read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 - read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 - ''' - - header = {'HD': {'VN': '1.0'}, - 'SQ': [{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], } - - bamfile = os.path.join(DATADIR, "ex6.bam") - samfile = os.path.join(DATADIR, "ex6.sam") - - def setUp(self): - - a = pysam.AlignedRead() - a.qname = "read_28833_29006_6945" - a.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG" - a.flag = 99 - a.rname = 0 - a.pos = 32 - a.mapq = 20 - a.cigar = ((0, 10), (2, 1), (0, 25)) - a.mrnm = 0 - a.mpos = 199 - a.isize = 167 - a.qual = "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<" - a.tags = (("NM", 1), - ("RG", "L1")) - - b = pysam.AlignedRead() - b.qname = "read_28701_28881_323b" - b.seq = "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA" - b.flag = 147 - b.rname = 1 - b.pos = 87 - b.mapq = 30 - b.cigar = ((0, 35), ) - b.mrnm = 1 - b.mpos = 499 - b.isize = 412 - b.qual = "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<" - b.tags = (("MF", 18), - ("RG", "L2")) - - self.reads = (a, b) - - # TODO - # def testSAMWholeFile(self): - - # tmpfilename = "tmp_%i.sam" % id(self) - - # outfile = pysam.Samfile(tmpfilename, - # "wh", - # header=self.header) - - # for x in self.reads: - # outfile.write(x) - # outfile.close() - # self.assertTrue(checkBinaryEqual(tmpfilename, self.samfile), - # "mismatch when construction SAM file, see %s %s" % (tmpfilename, self.samfile)) - - # os.unlink(tmpfilename) - - def testBAMPerRead(self): - '''check if individual reads are binary equal.''' - infile = pysam.Samfile(self.bamfile, "rb") - - others = list(infile) - for denovo, other in zip(others, self.reads): - self.checkFieldEqual(other, denovo) - self.assertEqual(other.compare(denovo), 0) - - # TODO - # def testSAMPerRead(self): - # '''check if individual reads are binary equal.''' - # infile = pysam.Samfile(self.samfile, "r") - - # others = list(infile) - # for denovo, other in zip(others, self.reads): - # self.checkFieldEqual(other, denovo) - # self.assertEqual(other.compare(denovo), 0) - - def testBAMWholeFile(self): - - tmpfilename = "tmp_%i.bam" % id(self) - - outfile = pysam.Samfile(tmpfilename, "wb", header=self.header) - - for x in self.reads: - outfile.write(x) - outfile.close() - - self.assertTrue(checkBinaryEqual(tmpfilename, self.bamfile), - "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile)) - - os.unlink(tmpfilename) - - -class TestDeNovoConstructionUserTags(TestDeNovoConstruction): - - '''test de novo construction with a header that contains lower-case tags.''' - - header = {'HD': {'VN': '1.0'}, - 'SQ': [{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], - 'x1': {'A': 2, 'B': 5}, - 'x3': {'A': 6, 'B': 5}, - 'x2': {'A': 4, 'B': 5}} - - bamfile = os.path.join(DATADIR, "example_user_header.bam") - samfile = os.path.join(DATADIR, "example_user_header.sam") - - -class TestEmptyHeader(unittest.TestCase): - - '''see issue 84.''' - - def testEmptyHeader(self): - - s = pysam.Samfile(os.path.join(DATADIR, 'example_empty_header.bam')) - self.assertEqual(s.header, {'SQ': [{'LN': 1000, 'SN': 'chr1'}]}) - -COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204, - 0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78, - 197, 0, 7, 100, 95, 101, 202, 0, 6, 0, 1, - 186, 0, 84, 0, 244, 0, 0, 324, 0, 107, 195, - 101, 113, 0, 102, 0, 104, 3, 0, 101, 1, 0, - 212, 6, 0, 0, 1, 0, 74, 1, 11, 0, 196, 2, - 197, 103, 0, 108, 98, 2, 7, 0, 1, 2, 194, - 0, 180, 0, 108, 0, 203, 104, 16, 5, 205, - 0, 0, 0, 1, 1, 100, 98, 0, 0, 204, 6, 0, - 79, 0, 0, 101, 7, 109, 90, 265, 1, 27, 10, - 109, 102, 9, 0, 292, 0, 110, 0, 0, 102, - 112, 0, 0, 84, 100, 103, 2, 81, 126, 0, 2, - 90, 0, 15, 96, 15, 1, 0, 2, 0, 107, 92, 0, - 0, 101, 3, 98, 15, 102, 13, 116, 116, 90, 93, - 198, 0, 0, 0, 199, 92, 26, 495, 100, 5, 0, - 100, 5, 209, 0, 92, 107, 90, 0, 0, 0, 0, 109, - 194, 7, 94, 200, 0, 40, 197, 0, 11, 0, 0, 112, - 110, 6, 4, 200, 28, 0, 196, 0, 203, 1, 129, - 0, 0, 1, 0, 94, 0, 1, 0, 107, 5, 201, 3, 3, 100, - 0, 121, 0, 7, 0, 1, 105, 306, 3, 86, 8, 183, 0, - 12, 163, 17, 83, 22, 0, 0, 1, 8, 109, 103, 0, 0, - 295, 0, 200, 16, 172, 3, 16, 182, 3, 11, 0, 0, - 223, 111, 103, 0, 5, 225, 0, 95] - - -class TestBTagSam(unittest.TestCase): - - '''see issue 81.''' - - compare = [COMPARE_BTAG, - [-100, 200, -300, -400], - [-100, 12], - [12, 15], - [-1.0, 5.0, 2.5]] - - filename = os.path.join(DATADIR, 'example_btag.sam') - - read0 = [('RG', 'QW85I'), - ('PG', 'tmap'), - ('MD', '140'), - ('NM', 0), - ('AS', 140), - ('FZ', array.array('H', COMPARE_BTAG)), - ('XA', 'map2-1'), - ('XS', 53), - ('XT', 38), - ('XF', 1), - ('XE', 0)] - - def testReadTags(self): - - s = pysam.Samfile(self.filename) - for x, read in enumerate(s): - tags = read.tags - if x == 0: - self.assertEqual(tags, self.read0) - - fz = list(dict(tags)["FZ"]) - self.assertEqual(fz, self.compare[x]) - self.assertEqual(list(read.opt("FZ")), self.compare[x]) - self.assertEqual(tags, read.get_tags()) - for tag, value in tags: - self.assertEqual(value, read.get_tag(tag)) - - def testReadWriteTags(self): - - s = pysam.Samfile(self.filename) - for read in s: - before = read.tags - read.tags = before - self.assertEqual(read.tags, before) - - read.set_tags(before) - self.assertEqual(read.tags, before) - - for tag, value in before: - read.set_tag(tag, value) - self.assertEqual(value, read.get_tag(tag)) - - -class TestBTagBam(TestBTagSam): - filename = os.path.join(DATADIR, 'example_btag.bam') - - -class TestDoubleFetch(unittest.TestCase): - - '''check if two iterators on the same bamfile are independent.''' - - filename = os.path.join(DATADIR, 'ex1.bam') - - def testDoubleFetch(self): - - samfile1 = pysam.Samfile(self.filename, 'rb') - - for a, b in zip(samfile1.fetch(multiple_iterators=True), - samfile1.fetch(multiple_iterators=True)): - self.assertEqual(a.compare(b), 0) - - def testDoubleFetchWithRegion(self): - - samfile1 = pysam.Samfile(self.filename, 'rb') - chr, start, stop = 'chr1', 200, 3000000 - # just making sure the test has something to catch - self.assertTrue(len(list(samfile1.fetch(chr, start, stop))) > 0) - - for a, b in zip(samfile1.fetch(chr, start, stop), - samfile1.fetch(chr, start, stop, - multiple_iterators=True)): - self.assertEqual(a.compare(b), 0) - - def testDoubleFetchUntilEOF(self): - - samfile1 = pysam.Samfile(self.filename, 'rb') - - for a, b in zip(samfile1.fetch(until_eof=True), - samfile1.fetch(until_eof=True, - multiple_iterators=True)): - self.assertEqual(a.compare(b), 0) - - -class TestRemoteFileFTP(unittest.TestCase): - - '''test remote access. - - ''' - - # Need to find an ftp server without password on standard - # port. - - url = "ftp://ftp.sanger.ac.uk/pub/rd/humanSequences/CV.bam" - region = "1:1-1000" - - def testFTPView(self): - return - if not checkURL(self.url): - return - - result = pysam.samtools.view(self.url, self.region) - self.assertEqual(len(result), 36) - - def testFTPFetch(self): - return - if not checkURL(self.url): - return - - samfile = pysam.Samfile(self.url, "rb") - result = list(samfile.fetch(region=self.region)) - self.assertEqual(len(result), 36) - - -class TestRemoteFileHTTP(unittest.TestCase): - - url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/ex1.bam" - region = "chr1:1-1000" - local = os.path.join(DATADIR, "ex1.bam") - - def testView(self): - if not checkURL(self.url): - return - - samfile_local = pysam.Samfile(self.local, "rb") - ref = list(samfile_local.fetch(region=self.region)) - - result = pysam.samtools.view( - self.url, self.region).splitlines(True) - self.assertEqual(len(result), len(ref)) - - def testFetch(self): - if not checkURL(self.url): - return - - samfile = pysam.Samfile(self.url, "rb") - result = list(samfile.fetch(region=self.region)) - samfile_local = pysam.Samfile(self.local, "rb") - ref = list(samfile_local.fetch(region=self.region)) - - self.assertEqual(len(ref), len(result)) - for x, y in zip(result, ref): - self.assertEqual(x.compare(y), 0) - - def testFetchAll(self): - if not checkURL(self.url): - return - - samfile = pysam.Samfile(self.url, "rb") - result = list(samfile.fetch()) - samfile_local = pysam.Samfile(self.local, "rb") - ref = list(samfile_local.fetch()) - - self.assertEqual(len(ref), len(result)) - for x, y in zip(result, ref): - self.assertEqual(x.compare(y), 0) - - -class TestLargeOptValues(unittest.TestCase): - - ints = (65536, 214748, 2147484, 2147483647) - floats = (65536.0, 214748.0, 2147484.0) - - def check(self, samfile): - - i = samfile.fetch() - for exp in self.ints: - rr = next(i) - obs = rr.opt("ZP") - self.assertEqual(exp, obs, - "expected %s, got %s\n%s" % - (str(exp), str(obs), str(rr))) - - for exp in [-x for x in self.ints]: - rr = next(i) - obs = rr.opt("ZP") - self.assertEqual(exp, obs, - "expected %s, got %s\n%s" % - (str(exp), str(obs), str(rr))) - - for exp in self.floats: - rr = next(i) - obs = rr.opt("ZP") - self.assertEqual(exp, obs, - "expected %s, got %s\n%s" % - (str(exp), str(obs), str(rr))) - - for exp in [-x for x in self.floats]: - rr = next(i) - obs = rr.opt("ZP") - self.assertEqual(exp, obs, "expected %s, got %s\n%s" % - (str(exp), str(obs), str(rr))) - - def testSAM(self): - samfile = pysam.Samfile( - os.path.join(DATADIR, "ex10.sam"), - "r") - self.check(samfile) - - def testBAM(self): - samfile = pysam.Samfile( - os.path.join(DATADIR, "ex10.bam"), - "rb") - self.check(samfile) - - -class TestPileup(unittest.TestCase): - - '''test pileup functionality.''' - - samfilename = "pysam_data/ex1.bam" - fastafilename = "pysam_data/ex1.fa" - - def setUp(self): - - self.samfile = pysam.Samfile(self.samfilename) - self.fastafile = pysam.Fastafile(self.fastafilename) - - def checkEqual(self, references, iterator): - - for x, column in enumerate(iterator): - (contig, pos, reference_base, - read_bases, read_qualities, alignment_mapping_qualities) \ - = references[x][:-1].split("\t") - self.assertEqual(int(pos) - 1, column.pos) - - def testSamtoolsStepper(self): - refs = force_str( - pysam.samtools.mpileup( - "-f", self.fastafilename, - self.samfilename)).splitlines(True) - iterator = self.samfile.pileup( - stepper="samtools", - fastafile=self.fastafile) - self.checkEqual(refs, iterator) - - def testAllStepper(self): - refs = force_str( - pysam.samtools.mpileup( - "-f", self.fastafilename, - "-A", "-B", - self.samfilename)).splitlines(True) - - iterator = self.samfile.pileup( - stepper="all", - fastafile=self.fastafile) - self.checkEqual(refs, iterator) - - -class TestLogging(unittest.TestCase): - - '''test around bug issue 42, - - failed in versions < 0.4 - ''' - - def check(self, bamfile, log): - - if log: - logger = logging.getLogger('franklin') - logger.setLevel(logging.INFO) - formatter = logging.Formatter( - '%(asctime)s %(levelname)s %(message)s') - log_hand = logging.FileHandler('log.txt') - log_hand.setFormatter(formatter) - logger.addHandler(log_hand) - - bam = pysam.Samfile(bamfile, 'rb') - cols = bam.pileup() - self.assertTrue(True) - - def testFail1(self): - self.check(os.path.join(DATADIR, "ex9_fail.bam"), - False) - self.check(os.path.join(DATADIR, "ex9_fail.bam"), - True) - - def testNoFail1(self): - self.check(os.path.join(DATADIR, "ex9_nofail.bam"), - False) - self.check(os.path.join(DATADIR, "ex9_nofail.bam"), - True) - - def testNoFail2(self): - self.check(os.path.join(DATADIR, "ex9_nofail.bam"), - True) - self.check(os.path.join(DATADIR, "ex9_nofail.bam"), - True) - -# TODOS -# 1. finish testing all properties within pileup objects -# 2. check exceptions and bad input problems (missing files, optional fields that aren't present, etc...) -# 3. check: presence of sequence - - -class TestSamfileUtilityFunctions(unittest.TestCase): - - def testCount(self): - - samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - for contig in ("chr1", "chr2"): - for start in range(0, 2000, 100): - end = start + 1 - self.assertEqual( - len(list(samfile.fetch(contig, start, end))), - samfile.count(contig, start, end), - 'number mismatch for %s:%i-%i %i != %i' % ( - contig, start, end, - len(list(samfile.fetch(contig, start, end))), - samfile.count(contig, start, end))) - - # test empty intervals - self.assertEqual( - len(list(samfile.fetch(contig, start, start))), - samfile.count(contig, start, start), - 'number mismatch for %s:%i-%i %i != %i' % ( - contig, start, start, - len(list(samfile.fetch(contig, start, start))), - samfile.count(contig, start, start))) - - # test half empty intervals - self.assertEqual(len(list(samfile.fetch(contig, start))), - samfile.count(contig, start)) - - self.assertEqual( - len(list(samfile.fetch(contig, start))), - samfile.count(contig, start), - 'number mismatch for %s:%i %i != %i' % ( - contig, start, - len(list(samfile.fetch(contig, start))), - samfile.count(contig, start))) - - def testMate(self): - '''test mate access.''' - - with open(os.path.join(DATADIR, "ex1.sam"), "rb") as inf: - readnames = [x.split(b"\t")[0] for x in inf.readlines()] - if sys.version_info[0] >= 3: - readnames = [name.decode('ascii') for name in readnames] - - counts = collections.defaultdict(int) - for x in readnames: - counts[x] += 1 - - samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - for read in samfile.fetch(): - if not read.is_paired: - self.assertRaises(ValueError, samfile.mate, read) - elif read.mate_is_unmapped: - self.assertRaises(ValueError, samfile.mate, read) - else: - if counts[read.qname] == 1: - self.assertRaises(ValueError, samfile.mate, read) - else: - mate = samfile.mate(read) - self.assertEqual(read.qname, mate.qname) - self.assertEqual(read.is_read1, mate.is_read2) - self.assertEqual(read.is_read2, mate.is_read1) - self.assertEqual(read.pos, mate.mpos) - self.assertEqual(read.mpos, mate.pos) - - def testIndexStats(self): - '''test if total number of mapped/unmapped reads is correct.''' - - samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - self.assertEqual(samfile.mapped, 3235) - self.assertEqual(samfile.unmapped, 35) - self.assertEqual(samfile.nocoordinate, 0) - - -class TestSamtoolsProxy(unittest.TestCase): - - '''tests for sanity checking access to samtools functions.''' - - def testIndex(self): - self.assertRaises(IOError, pysam.samtools.index, "missing_file") - - def testView(self): - # note that view still echos "open: No such file or directory" - self.assertRaises(pysam.SamtoolsError, pysam.samtools.view, "missing_file") - - def testSort(self): - self.assertRaises(pysam.SamtoolsError, pysam.samtools.sort, "missing_file") - - -class TestSamfileIndex(unittest.TestCase): - - def testIndex(self): - samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"), - "rb") - index = pysam.IndexedReads(samfile) - index.build() - reads = collections.defaultdict(int) - - for read in samfile: - reads[read.qname] += 1 - - for qname, counts in reads.items(): - found = list(index.find(qname)) - self.assertEqual(len(found), counts) - for x in found: - self.assertEqual(x.qname, qname) - - -if __name__ == "__main__": - # build data files - print ("building data files") - subprocess.call("make -C %s" % DATADIR, shell=True) - print ("starting tests") - unittest.main() - print ("completed tests") diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py index ce59da7..de54de5 100644 --- a/tests/StreamFiledescriptors_test.py +++ b/tests/StreamFiledescriptors_test.py @@ -1,4 +1,5 @@ import os +import sys import subprocess import threading import errno @@ -6,6 +7,8 @@ import unittest from pysam import AlignmentFile +IS_PYTHON2 = sys.version_info[0] == 2 + DATADIR = os.path.abspath(os.path.join( os.path.dirname(__file__), "pysam_data")) @@ -13,7 +16,7 @@ DATADIR = os.path.abspath(os.path.join( def alignmentfile_writer_thread(infile, outfile): def _writer_thread(infile, outfile): - """read from infile and write to outfile""" + """read from infile and write to outfile""" try: i = 0 for record in infile: @@ -41,42 +44,48 @@ class StreamTest(unittest.TestCase): read += 1 return 0, read + @unittest.skipIf(IS_PYTHON2, "no context manager in py2") def test_text_processing(self): - proc = subprocess.Popen('head -n200', - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - shell=True) + with subprocess.Popen('head -n200', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + shell=True) as proc: - in_stream = AlignmentFile('pysam_data/ex1.bam') - out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header) - writer = alignmentfile_writer_thread(in_stream, - out_stream) + in_stream = AlignmentFile('pysam_data/ex1.bam') + out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header) + writer = alignmentfile_writer_thread(in_stream, + out_stream) - written, read = self.stream_process(proc, - in_stream, - out_stream, - writer) - self.assertEqual(read, 198) + written, read = self.stream_process(proc, + in_stream, + out_stream, + writer) + self.assertEqual(read, 198) + @unittest.skip("test contains bug") def test_samtools_processing(self): - - proc = subprocess.Popen('samtools view -b -f 4', - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - shell=True) - - in_stream = AlignmentFile('pysam_data/ex1.bam') - out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header) - writer = alignmentfile_writer_thread(in_stream, - out_stream) - - written, read = self.stream_process(proc, - in_stream, - out_stream, - writer) - self.assertEqual(read, 35) - + + # The following test causes the suite to hang + # as the stream_processor raises: + # ValueError: file has no sequences defined (mode='r') - is it SAM/BAM format? + # The whole setup then hangs during exception handling. + with subprocess.Popen('samtools view -b -f 4', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + shell=True) as proc: + + in_stream = AlignmentFile('pysam_data/ex1.bam') + out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header) + writer = alignmentfile_writer_thread(in_stream, + out_stream) + + written, read = self.stream_process(proc, + in_stream, + out_stream, + writer) + self.assertEqual(read, 35) + if __name__ == "__main__": unittest.main() diff --git a/tests/TestUtils.py b/tests/TestUtils.py index 71ab22a..1168926 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -78,7 +78,6 @@ def check_samtools_view_equal( '''return true if the two files are equal in their content through samtools view. ''' - # strip MD and NM tags, as not preserved in CRAM files args = ["-x", "MD", "-x", "NM"] if not without_header: @@ -161,8 +160,10 @@ def check_lines_equal(cls, a, b, sort=False, filter_f=None, msg=None): filter_f: remover lines in both a and b where expression is True """ - aa = openfile(a).readlines() - bb = openfile(b).readlines() + with openfile(a) as inf: + aa = inf.readlines() + with openfile(b) as inf: + bb = inf.readlines() if filter_f is not None: aa = [x for x in aa if not filter_f(x)] @@ -183,3 +184,28 @@ def get_temp_filename(suffix=""): dir=".") f.close() return f.name + + +def load_and_convert(filename, encode=True): + '''load data from filename and convert all fields to string. + + Filename can be either plain or compressed (ending in .gz). + ''' + data = [] + if filename.endswith(".gz"): + with gzip.open(filename) as inf: + for line in inf: + line = line.decode("ascii") + if line.startswith("#"): + continue + d = line.strip().split("\t") + data.append(d) + else: + with open(filename) as f: + for line in f: + if line.startswith("#"): + continue + d = line.strip().split("\t") + data.append(d) + + return data diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index aa82c66..93307e9 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -10,10 +10,9 @@ try: except ImportError: Path = None -from TestUtils import get_temp_filename, check_lines_equal +from TestUtils import get_temp_filename, check_lines_equal, load_and_convert DATADIR="cbcf_data" -from tabix_test import loadAndConvert def read_header(filename): @@ -37,7 +36,7 @@ class TestMissingGenotypes(unittest.TestCase): filename = "missing_genotypes.vcf" def setUp(self): - self.compare = loadAndConvert( + self.compare = load_and_convert( os.path.join(DATADIR, self.filename), encode=False) diff --git a/tests/faidx_test.py b/tests/faidx_test.py index a123550..c87394d 100644 --- a/tests/faidx_test.py +++ b/tests/faidx_test.py @@ -222,15 +222,27 @@ class TestRemoteFileFTP(unittest.TestCase): url = "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa" - def testFTPView(self): if not checkURL(self.url): return + with pysam.Fastafile(self.url) as f: self.assertEqual( len(f.fetch("chr1", 0, 1000)), 1000) + def test_sequence_lengths_are_available(self): + if not checkURL(self.url): + return + + with pysam.Fastafile(self.url) as f: + self.assertEqual(len(f.references), 3366) + self.assertTrue("chr1" in f.references) + self.assertEqual(f.lengths[0], + 248956422) + self.assertEqual(f.get_reference_length("chr1"), + 248956422) + if __name__ == "__main__": unittest.main() diff --git a/tests/samtools_test.py b/tests/samtools_test.py index aa4c554..7eec832 100644 --- a/tests/samtools_test.py +++ b/tests/samtools_test.py @@ -63,7 +63,10 @@ class SamtoolsTest(unittest.TestCase): "ex1.fa", "ex1.fa.fai", "ex1.sam.gz", "ex1.bam", "ex1.bam.bai", - "ex1.sam", "ex2.bam", + "ex1.sam", + "ex1.sam", + "ex2.bam", + "ex2.sam", "ex1.bed"] # a list of statements to test @@ -92,7 +95,7 @@ class SamtoolsTest(unittest.TestCase): # unknow option # "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam", # "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam", - "reheader ex1.sam ex1.bam > %(out)s_ex1.reheader", + "reheader ex2.sam ex1.bam > %(out)s_ex1.reheader.bam", "cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam", "targetcut ex1.bam > %(out)s_ex1.targetcut", "phase ex1.bam > %(out)s_ex1.phase", @@ -143,7 +146,6 @@ class SamtoolsTest(unittest.TestCase): files. ''' - self.check_version() if not os.path.exists(WORKDIR): @@ -158,14 +160,23 @@ class SamtoolsTest(unittest.TestCase): return + def get_command(self, statement, map_to_internal=True): + """return samtools command from statement""" + parts = statement.split(" ") + command = parts[0] + if map_to_internal: + return self.map_command.get(command, command) + else: + return command + def check_statement(self, statement): parts = statement.split(" ") r_samtools = {"out": self.executable} r_pysam = {"out": "pysam"} - command = parts[0] - command = self.map_command.get(command, command) + command = self.get_command(statement) + # self.assertTrue(command in pysam.SAMTOOLS_DISPATCH) targets = [x for x in parts if "%(out)s" in x] @@ -217,9 +228,10 @@ class SamtoolsTest(unittest.TestCase): check_samtools_view_equal( s, p, without_header=True), error_msg) - check_lines_equal( - self, s, p, - filter_f=lambda x: x.startswith("#"), + else: + check_lines_equal( + self, s, p, + filter_f=lambda x: x.startswith("#"), msg=error_msg) def testStatements(self): @@ -232,6 +244,22 @@ class SamtoolsTest(unittest.TestCase): continue self.check_statement(statement) + @unittest.skipIf(sys.platform == "darwin", "not supported, pattern does not match") + def testUsage(self): + if self.executable == "bcftools": + # bcftools usage messages end with exit(1) + return + + for statement in self.statements: + command = self.get_command(statement, map_to_internal=False) + if command == "bam2fq": + continue + mapped_command = self.get_command(statement, map_to_internal=True) + pysam_method = getattr(self.module, mapped_command) + usage_msg = pysam_method.usage() + expected = "Usage:\s+{} {}".format(self.executable, command) + self.assertTrue(re.search(expected, usage_msg) is not None) + def tearDown(self): if os.path.exists(WORKDIR): shutil.rmtree(WORKDIR) @@ -342,7 +370,8 @@ class BcftoolsTest(SamtoolsTest): # "filter -s A ex1.vcf.gz > %(out)s_ex1.filter", # exit # "gtcheck -s A ex1.vcf.gz > %(out)s_ex1.gtcheck", - "roh -s A ex1.vcf.gz > %(out)s_ex1.roh", + # segfauld, used to work wit bcftools 1.3 + # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh", "stats ex1.vcf.gz > %(out)s_ex1.stats", ] diff --git a/tests/tabix_data/example.gff2.gz b/tests/tabix_data/example.gff2.gz new file mode 100644 index 0000000..4084a74 Binary files /dev/null and b/tests/tabix_data/example.gff2.gz differ diff --git a/tests/tabix_data/example.gff2.gz.tbi b/tests/tabix_data/example.gff2.gz.tbi new file mode 100644 index 0000000..30d39ae Binary files /dev/null and b/tests/tabix_data/example.gff2.gz.tbi differ diff --git a/tests/tabix_data/example.gff3.gz b/tests/tabix_data/example.gff3.gz new file mode 100644 index 0000000..b42b41b Binary files /dev/null and b/tests/tabix_data/example.gff3.gz differ diff --git a/tests/tabix_data/example.gff3.gz.tbi b/tests/tabix_data/example.gff3.gz.tbi new file mode 100644 index 0000000..855e139 Binary files /dev/null and b/tests/tabix_data/example.gff3.gz.tbi differ diff --git a/tests/tabix_test.py b/tests/tabix_test.py index ec1e37e..87de282 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -14,7 +14,7 @@ import unittest import glob import re import copy -from TestUtils import checkURL +from TestUtils import checkURL, load_and_convert DATADIR = 'tabix_data' @@ -35,31 +35,6 @@ def myzip_open(infile, mode="r"): return gzip.open(mode) -def loadAndConvert(filename, encode=True): - '''load data from filename and convert all fields to string. - - Filename can be either plain or compressed (ending in .gz). - ''' - data = [] - if filename.endswith(".gz"): - with gzip.open(filename) as inf: - for line in inf: - line = line.decode("ascii") - if line.startswith("#"): - continue - d = line.strip().split("\t") - data.append(d) - else: - with open(filename) as f: - for line in f: - if line.startswith("#"): - continue - d = line.strip().split("\t") - data.append(d) - - return data - - def splitToBytes(s): '''split string and return list of bytes.''' return [x.encode("ascii") for x in s.split("\t")] @@ -396,150 +371,8 @@ class TestIterationWithComments(TestIterationWithoutComments): TestIterationWithoutComments.setUp(self) -class TestParser(unittest.TestCase): - - filename = os.path.join(DATADIR, "example.gtf.gz") - - def setUp(self): - - self.tabix = pysam.TabixFile(self.filename) - self.compare = loadAndConvert(self.filename) - - def tearDown(self): - self.tabix.close() - - def testRead(self): - - for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): - c = self.compare[x] - self.assertEqual(c, list(r)) - self.assertEqual(len(c), len(r)) - - # test indexing - for y in range(0, len(r)): - self.assertEqual(c[y], r[y]) - - # test slicing access - for y in range(0, len(r) - 1): - for cc in range(y + 1, len(r)): - self.assertEqual(c[y:cc], - r[y:cc]) - self.assertEqual("\t".join(map(str, c)), - str(r)) - - def testWrite(self): - - for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): - self.assertEqual(self.compare[x], list(r)) - c = list(r) - for y in range(len(r)): - r[y] = "test_%05i" % y - c[y] = "test_%05i" % y - self.assertEqual([x for x in c], list(r)) - self.assertEqual("\t".join(c), str(r)) - # check second assignment - for y in range(len(r)): - r[y] = "test_%05i" % y - self.assertEqual([x for x in c], list(r)) - self.assertEqual("\t".join(c), str(r)) - - def testUnset(self): - for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): - self.assertEqual(self.compare[x], list(r)) - c = list(r) - e = list(r) - for y in range(len(r)): - r[y] = None - c[y] = None - e[y] = "" - self.assertEqual(c, list(r)) - self.assertEqual("\t".join(e), str(r)) - - def testIteratorCompressed(self): - '''test iteration from compressed file.''' - with gzip.open(self.filename) as infile: - for x, r in enumerate(pysam.tabix_iterator( - infile, pysam.asTuple())): - self.assertEqual(self.compare[x], list(r)) - self.assertEqual(len(self.compare[x]), len(r)) - - # test indexing - for c in range(0, len(r)): - self.assertEqual(self.compare[x][c], r[c]) - - # test slicing access - for c in range(0, len(r) - 1): - for cc in range(c + 1, len(r)): - self.assertEqual(self.compare[x][c:cc], - r[c:cc]) - - def testIteratorUncompressed(self): - '''test iteration from uncompressed file.''' - tmpfilename = 'tmp_testIteratorUncompressed' - with gzip.open(self.filename, "rb") as infile, \ - open(tmpfilename, "wb") as outfile: - outfile.write(infile.read()) - - with open(tmpfilename) as infile: - for x, r in enumerate(pysam.tabix_iterator( - infile, pysam.asTuple())): - self.assertEqual(self.compare[x], list(r)) - self.assertEqual(len(self.compare[x]), len(r)) - - # test indexing - for c in range(0, len(r)): - self.assertEqual(self.compare[x][c], r[c]) - - # test slicing access - for c in range(0, len(r) - 1): - for cc in range(c + 1, len(r)): - self.assertEqual(self.compare[x][c:cc], - r[c:cc]) - - os.unlink(tmpfilename) - - def testCopy(self): - a = self.tabix.fetch(parser=pysam.asTuple()).next() - b = copy.copy(a) - self.assertEqual(a, b) - - a = self.tabix.fetch(parser=pysam.asGTF()).next() - b = copy.copy(a) - self.assertEqual(a, b) - - -class TestGTF(TestParser): - - def testRead(self): - - for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())): - c = self.compare[x] - self.assertEqual(len(c), len(r)) - self.assertEqual(list(c), list(r)) - self.assertEqual(c, str(r).split("\t")) - self.assertTrue(r.gene_id.startswith("ENSG")) - if r.feature != 'gene': - self.assertTrue(r.transcript_id.startswith("ENST")) - self.assertEqual(c[0], r.contig) - self.assertEqual("\t".join(map(str, c)), - str(r)) - - def testSetting(self): - - for r in self.tabix.fetch(parser=pysam.asGTF()): - r.contig = r.contig + "_test" - r.source = r.source + "_test" - r.feature = r.feature + "_test" - r.start += 10 - r.end += 10 - r.score = 20 - r.strand = "+" - r.frame = 0 - r.attributes = 'gene_id "0001";' - - + class TestIterators(unittest.TestCase): - filename = os.path.join(DATADIR, "example.gtf.gz") iterator = pysam.tabix_generic_iterator @@ -549,7 +382,7 @@ class TestIterators(unittest.TestCase): def setUp(self): self.tabix = pysam.TabixFile(self.filename) - self.compare = loadAndConvert(self.filename) + self.compare = load_and_convert(self.filename) self.tmpfilename_uncompressed = 'tmp_TestIterators' with gzip.open(self.filename, "rb") as infile, \ open(self.tmpfilename_uncompressed, "wb") as outfile: @@ -622,7 +455,6 @@ class TestIterationMalformattedGTFFiles(unittest.TestCase): '''test reading from malformatted gtf files.''' - parser = pysam.asGTF iterator = pysam.tabix_generic_iterator parser = pysam.asGTF @@ -653,7 +485,7 @@ class TestBed(unittest.TestCase): def setUp(self): self.tabix = pysam.TabixFile(self.filename) - self.compare = loadAndConvert(self.filename) + self.compare = load_and_convert(self.filename) def tearDown(self): self.tabix.close() @@ -751,7 +583,7 @@ class TestVCFFromTabix(TestVCF): TestVCF.setUp(self) self.tabix = pysam.TabixFile(self.tmpfilename + ".gz") - self.compare = loadAndConvert(self.filename) + self.compare = load_and_convert(self.filename) def tearDown(self): self.tabix.close() @@ -858,42 +690,44 @@ class TestVCFFromVCF(TestVCF): TestVCF.setUp(self) self.vcf = pysam.VCF() - self.compare = loadAndConvert(self.filename, encode=False) + self.compare = load_and_convert(self.filename, encode=False) def tearDown(self): self.vcf.close() - def testConnecting(self): + def open_vcf(self, fn): + return self.vcf.connect(fn) + + def get_failure_stage(self): fn = os.path.basename(self.filename) for x, msg in self.fail_on_opening: - if "%i.vcf" % x == fn: - self.assertRaises(ValueError, - self.vcf.connect, - self.tmpfilename + ".gz") - else: - self.vcf.connect(self.tmpfilename + ".gz") + if "{}.vcf".format(x) == fn: + return "opening" + + for x, msg in self.fail_on_parsing: + if "{}.vcf".format(x) == fn: + return "parsing" + + for x, msg in self.fail_on_samples: + if "{}.vcf".format(x) == fn: + return "samples" + + return None + + def testConnecting(self): + + if self.get_failure_stage() == "opening": + self.assertRaises(ValueError, + self.open_vcf, + self.tmpfilename + ".gz") + else: + self.open_vcf(self.tmpfilename + ".gz") def get_iterator(self): with open(self.filename) as f: fn = os.path.basename(self.filename) - - for x, msg in self.fail_on_opening: - if "%i.vcf" % x == fn: - self.assertRaises(ValueError, self.vcf.parse, f) - return - - for vcf_code, msg in self.fail_on_parsing: - if "%i.vcf" % vcf_code == fn: - self.assertRaises((ValueError, - AssertionError), - list, self.vcf.parse(f)) - return - # python 2.7 - # self.assertRaisesRegexp( - # ValueError, re.compile(msg), self.vcf.parse, f) - return list(self.vcf.parse(f)) def get_field_value(self, record, field): @@ -918,22 +752,15 @@ class TestVCFFromVCF(TestVCF): def testParsing(self): + if self.get_failure_stage() in ("opening", "parsing"): + return + itr = self.get_iterator() if itr is None: return fn = os.path.basename(self.filename) - for vcf_code, msg in self.fail_on_parsing: - if "%i.vcf" % vcf_code == fn: - self.assertRaises((ValueError, - AssertionError), - list, itr) - return - # python 2.7 - # self.assertRaisesRegexp( - # ValueError, re.compile(msg), self.vcf.parse, f) - check_samples = self.check_samples for vcf_code, msg in self.fail_on_samples: if "%i.vcf" % vcf_code == fn: @@ -1079,8 +906,14 @@ class TestVCFFromVariantFile(TestVCFFromVCF): "ref", "alts", "qual", "filter", "info", "format") - fail_on_parsing = [] - fail_on_opening = [] + fail_on_parsing = [ + (24, "Could not parse the header, sample line not found"), + ("issue85", "empty VCF"), + ] + fail_on_opening = [ + (24, "Could not parse the header, sample line not found"), + ("issue85", "empty VCF"), + ] coordinate_offset = 0 check_samples = True fail_on_samples = [ @@ -1134,7 +967,7 @@ class TestVCFFromVariantFile(TestVCFFromVCF): def setUp(self): TestVCF.setUp(self) - self.compare = loadAndConvert(self.filename, encode=False) + self.compare = load_and_convert(self.filename, encode=False) def tearDown(self): if self.vcf: @@ -1148,9 +981,14 @@ class TestVCFFromVariantFile(TestVCFFromVCF): def get_field_value(self, record, field): return getattr(record, field) + def open_vcf(self, fn): + with pysam.VariantFile(fn) as inf: + pass + for vcf_file in vcf_files: - n = "TestVCFFromVariantFile_%s" % os.path.basename(vcf_file[:-4]) + p = os.path.basename(vcf_file[:-4]) + n = "TestVCFFromVariantFile_%s" % p globals()[n] = type(n, (TestVCFFromVariantFile,), dict(filename=vcf_file,)) @@ -1241,7 +1079,7 @@ class TestBackwardsCompatibility(unittest.TestCase): def check(self, filename, raises=None): with pysam.TabixFile(filename) as tf: - ref = loadAndConvert(filename) + ref = load_and_convert(filename) if raises is None: self.assertEqual(len(list(tf.fetch())), len(ref)) else: diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py new file mode 100644 index 0000000..cff0e59 --- /dev/null +++ b/tests/tabixproxies_test.py @@ -0,0 +1,318 @@ +import unittest +import pysam +import os +import sys +import re +import copy +import gzip +from TestUtils import load_and_convert + +DATADIR = 'tabix_data' + + +class TestParser(unittest.TestCase): + + filename = os.path.join(DATADIR, "example.gtf.gz") + + def setUp(self): + + self.tabix = pysam.TabixFile(self.filename) + self.compare = load_and_convert(self.filename) + + def tearDown(self): + self.tabix.close() + + def testRead(self): + + for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): + c = self.compare[x] + self.assertEqual(c, list(r)) + self.assertEqual(len(c), len(r)) + + # test indexing + for y in range(0, len(r)): + self.assertEqual(c[y], r[y]) + + # test slicing access + for y in range(0, len(r) - 1): + for cc in range(y + 1, len(r)): + self.assertEqual(c[y:cc], + r[y:cc]) + self.assertEqual("\t".join(map(str, c)), + str(r)) + + def testWrite(self): + + for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): + self.assertEqual(self.compare[x], list(r)) + c = list(r) + for y in range(len(r)): + r[y] = "test_%05i" % y + c[y] = "test_%05i" % y + self.assertEqual([x for x in c], list(r)) + self.assertEqual("\t".join(c), str(r)) + # check second assignment + for y in range(len(r)): + r[y] = "test_%05i" % y + self.assertEqual([x for x in c], list(r)) + self.assertEqual("\t".join(c), str(r)) + + def testUnset(self): + for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): + self.assertEqual(self.compare[x], list(r)) + c = list(r) + e = list(r) + for y in range(len(r)): + r[y] = None + c[y] = None + e[y] = "" + self.assertEqual(c, list(r)) + self.assertEqual("\t".join(e), str(r)) + + def testIteratorCompressed(self): + '''test iteration from compressed file.''' + with gzip.open(self.filename) as infile: + for x, r in enumerate(pysam.tabix_iterator( + infile, pysam.asTuple())): + self.assertEqual(self.compare[x], list(r)) + self.assertEqual(len(self.compare[x]), len(r)) + + # test indexing + for c in range(0, len(r)): + self.assertEqual(self.compare[x][c], r[c]) + + # test slicing access + for c in range(0, len(r) - 1): + for cc in range(c + 1, len(r)): + self.assertEqual(self.compare[x][c:cc], + r[c:cc]) + + def testIteratorUncompressed(self): + '''test iteration from uncompressed file.''' + tmpfilename = 'tmp_testIteratorUncompressed' + with gzip.open(self.filename, "rb") as infile, \ + open(tmpfilename, "wb") as outfile: + outfile.write(infile.read()) + + with open(tmpfilename) as infile: + for x, r in enumerate(pysam.tabix_iterator( + infile, pysam.asTuple())): + self.assertEqual(self.compare[x], list(r)) + self.assertEqual(len(self.compare[x]), len(r)) + + # test indexing + for c in range(0, len(r)): + self.assertEqual(self.compare[x][c], r[c]) + + # test slicing access + for c in range(0, len(r) - 1): + for cc in range(c + 1, len(r)): + self.assertEqual(self.compare[x][c:cc], + r[c:cc]) + + os.unlink(tmpfilename) + + def testCopy(self): + a = self.tabix.fetch(parser=pysam.asTuple()).next() + b = copy.copy(a) + self.assertEqual(a, b) + + a = self.tabix.fetch(parser=pysam.asGTF()).next() + b = copy.copy(a) + self.assertEqual(a, b) + + +class TestGTF(TestParser): + + parser = pysam.asGTF + + def testRead(self): + + for x, r in enumerate(self.tabix.fetch(parser=self.parser())): + c = self.compare[x] + self.assertEqual(len(c), len(r)) + self.assertEqual(list(c), list(r)) + self.assertEqual(c, str(r).split("\t")) + self.assertTrue(r.gene_id.startswith("ENSG")) + if r.feature != 'gene': + self.assertTrue(r.transcript_id.startswith("ENST")) + self.assertEqual(c[0], r.contig) + self.assertEqual("\t".join(map(str, c)), + str(r)) + + def testSetting(self): + + r = self.tabix.fetch(parser=self.parser()).next() + + r.contig = r.contig + "_test_contig" + r.source = r.source + "_test_source" + r.feature = r.feature + "_test_feature" + r.start += 10 + r.end += 10 + r.score = 20 + r.strand = "+" + r.frame = 0 + r.attributes = 'gene_id "0001";' + r.transcript_id = "0002" + sr = str(r) + self.assertTrue("_test_contig" in sr) + self.assertTrue("_test_source" in sr) + self.assertTrue("_test_feature" in sr) + self.assertTrue("gene_id \"0001\"" in sr) + self.assertTrue("transcript_id \"0002\"" in sr) + + def test_added_attribute_is_output(self): + r = self.tabix.fetch(parser=self.parser()).next() + + r.new_int_attribute = 12 + self.assertTrue("new_int_attribute 12" in str(r).split("\t")[8]) + + r.new_float_attribute = 12.0 + self.assertTrue("new_float_attribute 12.0" in str(r).split("\t")[8]) + + r.new_text_attribute = "abc" + self.assertTrue("new_text_attribute \"abc\"" in str(r).split("\t")[8]) + + def test_setting_start_is_one_based(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.start = 1800 + self.assertEqual(r.start, 1800) + self.assertEqual(str(r).split("\t")[3], "1801") + + def test_setting_end_is_one_based(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.end = 2100 + self.assertEqual(r.end, 2100) + self.assertEqual(str(r).split("\t")[4], "2100") + + def test_setting_frame_to_none_produces_dot(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.frame = None + self.assertEqual(str(r).split("\t")[7], ".") + + r.frame = 2 + self.assertEqual(str(r).split("\t")[7], "2") + + r = self.tabix.fetch(parser=self.parser()).next() + r.frame = "." + self.assertEqual(r.frame, None) + self.assertEqual(str(r).split("\t")[7], ".") + + def test_setting_source_to_none_produces_dot(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.source = None + self.assertEqual(str(r).split("\t")[1], ".") + + r.source = "source" + self.assertEqual(str(r).split("\t")[1], "source") + + r = self.tabix.fetch(parser=self.parser()).next() + r.source = "." + self.assertEqual(r.source, None) + self.assertEqual(str(r).split("\t")[1], ".") + + def test_setting_feature_to_none_produces_dot(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.feature = None + self.assertEqual(str(r).split("\t")[2], ".") + + r.feature = "feature" + self.assertEqual(str(r).split("\t")[2], "feature") + + r = self.tabix.fetch(parser=self.parser()).next() + r.feature = "." + self.assertEqual(r.feature, None) + self.assertEqual(str(r).split("\t")[2], ".") + + def test_setting_strand_to_none_produces_dot(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.strand = None + self.assertEqual(str(r).split("\t")[6], ".") + + r.strand = "-" + self.assertEqual(str(r).split("\t")[6], "-") + + r = self.tabix.fetch(parser=self.parser()).next() + r.strand = "." + self.assertEqual(r.strand, None) + self.assertEqual(str(r).split("\t")[6], ".") + + def test_setting_score_to_none_produces_dot(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.score = None + self.assertEqual(str(r).split("\t")[5], ".") + + r.score = 12.0 + self.assertEqual(str(r).split("\t")[5], "12.0") + + r.score = -12.0 + self.assertEqual(str(r).split("\t")[5], "-12.0") + + r = self.tabix.fetch(parser=self.parser()).next() + r.score = "." + self.assertEqual(r.score, None) + self.assertEqual(str(r).split("\t")[5], ".") + + r.score = 12 + self.assertEqual(str(r).split("\t")[5], "12") + + r.score = -12 + self.assertEqual(str(r).split("\t")[5], "-12") + + +class TestGFF3(TestGTF): + + parser = pysam.asGFF3 + filename = os.path.join(DATADIR, "example.gff3.gz") + + def testRead(self): + for x, r in enumerate(self.tabix.fetch(parser=self.parser())): + c = self.compare[x] + self.assertEqual(len(c), len(r)) + self.assertEqual(list(c), list(r)) + self.assertEqual(c, str(r).split("\t")) + self.assertEqual(c[0], r.contig) + self.assertEqual("\t".join(map(str, c)), + str(r)) + self.assertTrue(r.ID.startswith("MI00")) + + def testSetting(self): + + for r in self.tabix.fetch(parser=self.parser()): + r.contig = r.contig + "_test_contig" + r.source = "test_source" + r.feature = "test_feature" + r.start += 10 + r.end += 10 + r.score = 20 + r.strand = "+" + r.frame = 0 + r.ID="test" + sr = str(r) + self.assertTrue("test_contig" in sr) + self.assertTrue("test_source" in sr) + self.assertTrue("test_feature" in sr) + self.assertTrue("ID=test" in sr) + + def test_added_attribute_is_output(self): + r = self.tabix.fetch(parser=self.parser()).next() + + r.new_int_attribute = 12 + self.assertTrue("new_int_attribute=12" in str(r).split("\t")[8]) + + r.new_float_attribute = 12.0 + self.assertTrue("new_float_attribute=12.0" in str(r).split("\t")[8]) + + r.new_text_attribute = "abc" + self.assertTrue("new_text_attribute=abc" in str(r).split("\t")[8]) + + +if __name__ == "__main__": + unittest.main()