curl
-Pysam requires Python (2.6 or greater) and Cython (0.22 or greater).
+Pysam requires Python (2.7 or greater) and Cython (0.22 or greater).
It has not been tested on many other platforms.
Compilation
#define BCFTOOLS_H
#include <stdarg.h>
+#include <htslib/hts_defs.h>
#include <htslib/vcf.h>
#include <math.h>
#define FT_STDIN (1<<3)
char *bcftools_version(void);
-void error(const char *format, ...);
+void error(const char *format, ...) HTS_NORETURN;
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
{"chain",1,0,'c'},
{0,0,0,0}
};
- char c;
+ int c;
while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
{
-// fprintf(pysamerr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
+// fprintf(pysam_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
chain = (chain_t*) calloc(1,sizeof(chain_t));
chain->num = 0;
chain->block_lengths = NULL;
static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
{
-// fprintf(pysamerr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
+// fprintf(pysam_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
int num = chain->num;
if (ref_start <= chain->ref_last_block_ori) {
args->fp_out = fopen(args->output_fname,"w");
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
- else args->fp_out = stdout;
+ else args->fp_out = pysam_stdout;
}
static void destroy_data(args_t *args)
}
}
args->rid = bcf_hdr_name2id(args->hdr,line);
- if ( args->rid<0 ) fprintf(pysamerr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
+ if ( args->rid<0 ) fprintf(pysam_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
args->fa_length = 0;
args->fa_end_pos = to;
if ( rec->pos <= args->fa_frz_pos )
{
- fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ fprintf(pysam_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}
if ( args->mask )
}
else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
{
- // fprintf(pysamerr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
+ // fprintf(pysam_stderr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
char tmp = 0;
if ( args->fa_buf.l - idx > rec->rlen )
{
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Create consensus sequence by applying VCF variants to a reference\n");
- fprintf(pysamerr, " fasta file.\n");
- fprintf(pysamerr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(pysamerr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
- fprintf(pysamerr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
- fprintf(pysamerr, " -m, --mask <file> replace regions with N\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -c, --chain <file> write a chain file for liftover\n");
- fprintf(pysamerr, " -s, --sample <name> apply variants of the given sample\n");
- fprintf(pysamerr, "Examples:\n");
- fprintf(pysamerr, " # Get the consensus for one region. The fasta header lines are then expected\n");
- fprintf(pysamerr, " # in the form \">chr:from-to\".\n");
- fprintf(pysamerr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference\n");
+ fprintf(pysam_stderr, " fasta file.\n");
+ fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(pysam_stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
+ fprintf(pysam_stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(pysam_stderr, " -m, --mask <file> replace regions with N\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -c, --chain <file> write a chain file for liftover\n");
+ fprintf(pysam_stderr, " -s, --sample <name> apply variants of the given sample\n");
+ fprintf(pysam_stderr, "Examples:\n");
+ fprintf(pysam_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
+ fprintf(pysam_stderr, " # in the form \">chr:from-to\".\n");
+ fprintf(pysam_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
{"chain",1,0,'c'},
{0,0,0,0}
};
- char c;
+ int c;
while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
case BCF_BT_CHAR: kputc(info->v1.i, str); break;
- default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break;
+ default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
}
else if ( fmt->subscript >=0 )
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
- default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break;
+ default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
}
else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
{
fmt->type = T_INFO;
- fprintf(pysamerr,"Warning: Assuming INFO/%s\n", key);
+ fprintf(pysam_stderr,"Warning: Assuming INFO/%s\n", key);
}
}
}
char *p = convert->format_str;
while ( *p )
{
- //fprintf(pysamerr,"<%s>\n", p);
+ //fprintf(pysam_stderr,"<%s>\n", p);
switch (*p)
{
case '[': is_gtf = 1; p++; break;
minaux1_t *a = (minaux1_t*)data;
double p = 1., l = 0., f3[3];
int i;
-// printf("brent %lg\n", f);
+// fprintf(pysam_stdout, "brent %lg\n", f);
if (f < 0 || f > 1) return 1e300;
f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f;
for (i = a->beg; i < a->end; ++i) {
{
double f0 = *f, f3[3], err;
int i;
-// printf("em %lg\n", *f);
+// fprintf(pysam_stdout, "em %lg\n", *f);
f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
for (i = beg, f0 = 0.; i < end; ++i) {
const double *pdg = _pdg + i * 3;
double err, gg[3];
int i;
gg[0] = gg[1] = gg[2] = 0.;
-// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]);
+// fprintf(pysam_stdout, "%lg,%lg,%lg\n", g[0], g[1], g[2]);
for (i = beg; i < end; ++i) {
double sum, tmp[3];
const double *pdg = _pdg + i * 3;
{
double ff[4];
int i, k, h;
-// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
+// fprintf(pysam_stdout, "%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
memset(ff, 0, 4 * sizeof(double));
for (i = 0; i < n; ++i) {
double *p[2], sum, tmp;
case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break;
case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break;
- default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break;
+ default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
return -1; // this shouldn't happen
case BCF_BT_INT8: BRANCH(int8_t); break;
case BCF_BT_INT16: BRANCH(int16_t); break;
case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(pysamerr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
+ default: fprintf(pysam_stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
}
#undef BRANCH
{ \
if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
} \
- /*fprintf(pysamerr,"pass=%d\n", pass_site);*/ \
+ /*fprintf(pysam_stderr,"pass=%d\n", pass_site);*/ \
(ret) = pass_site; \
} \
}
if ( tok->tok_type==TOK_VAL )
{
if ( tok->key )
- fprintf(pysamerr,"%s", tok->key);
+ fprintf(pysam_stderr,"%s", tok->key);
else if ( tok->tag )
- fprintf(pysamerr,"%s", tok->tag);
+ fprintf(pysam_stderr,"%s", tok->tag);
else
- fprintf(pysamerr,"%e", tok->threshold);
+ fprintf(pysam_stderr,"%e", tok->threshold);
}
else
- fprintf(pysamerr,"%c", TOKEN_STRING[tok->tok_type]);
- if ( tok->setter ) fprintf(pysamerr,"\t[setter %p]", tok->setter);
- fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr,"%c", TOKEN_STRING[tok->tok_type]);
+ if ( tok->setter ) fprintf(pysam_stderr,"\t[setter %p]", tok->setter);
+ fprintf(pysam_stderr,"\n");
}
}
ret = filters_next_token(&tmp, &len);
if ( ret==-1 ) error("Missing quotes in: %s\n", str);
- //fprintf(pysamerr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
- //int i; for (i=0; i<nops; i++) fprintf(pysamerr," .%c.", TOKEN_STRING[ops[i]]); fprintf(pysamerr,"\n");
+ //fprintf(pysam_stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
+ //int i; for (i=0; i<nops; i++) fprintf(pysam_stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(pysam_stderr,"\n");
if ( ret==TOK_LFT ) // left bracket
{
/* khash_str2str.h -- C-string to C-string hash table.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014,2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
kh_destroy(str2str, hash);
}
+/*
+ *  Releases the whole table: every key string, every value string and
+ *  finally the hash structure itself. A NULL hash is ignored.
+ */
+static inline void khash_str2str_destroy_free_all(void *_hash)
+{
+    khash_t(str2str) *tbl = (khash_t(str2str)*)_hash;
+    if ( !tbl ) return;
+
+    khint_t it = 0;
+    while ( it < kh_end(tbl) )
+    {
+        // only occupied buckets hold strings to release
+        if ( kh_exist(tbl, it) )
+        {
+            free((char*)kh_key(tbl, it));
+            free((char*)kh_val(tbl, it));
+        }
+        ++it;
+    }
+    kh_destroy(str2str, tbl);
+}
+
/*
* Returns value if key exists or NULL if not
*/
/* main.c -- main bcftools command front-end.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
if (argc < 2) { usage(stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2015 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2016 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
/* main.c -- main bcftools command front-end.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int bcftools_main(int argc, char *argv[])
{
- if (argc < 2) { usage(pysamerr); return 1; }
+ if (argc < 2) { usage(pysam_stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2015 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ fprintf(pysam_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2016 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
- printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
+ fprintf(pysam_stdout, "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
- printf("License Expat: The MIT/Expat license\n");
+ fprintf(pysam_stdout, "License Expat: The MIT/Expat license\n");
#endif
- printf("This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n");
+ fprintf(pysam_stdout, "This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n");
return 0;
}
else if (strcmp(argv[1], "--version-only") == 0) {
- printf("%s+htslib-%s\n", bcftools_version(), hts_version());
+ fprintf(pysam_stdout, "%s+htslib-%s\n", bcftools_version(), hts_version());
return 0;
}
else if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) {
- if (argc == 2) { usage(stdout); return 0; }
+ if (argc == 2) { usage(pysam_stdout); return 0; }
// Otherwise change "bcftools help COMMAND [...]" to "bcftools COMMAND";
// main_xyz() functions by convention display the subcommand's usage
// when invoked without any arguments.
}
i++;
}
- fprintf(pysamerr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]);
+ fprintf(pysam_stderr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]);
return 1;
}
call->theta *= aM;
if ( call->theta >= 1 )
{
- fprintf(pysamerr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta);
+ fprintf(pysam_stderr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta);
call->theta = 0.99;
}
call->theta = log(call->theta);
double q = 2*fref*falt; // probability of a het, assuming HWE
double mean = q*ndiploid;
- //fprintf(pysamerr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
+ //fprintf(pysam_stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
// Can we use normal approximation? The second condition is for performance only
// and is not well justified.
if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 )
{
- //fprintf(pysamerr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
+ //fprintf(pysam_stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)));
}
if ( igt==GT_SKIP ) continue;
lk += gl[igt];
npresent++;
- // fprintf(pysamerr," %e", gl[igt]);
+ // fprintf(pysam_stderr," %e", gl[igt]);
}
- // fprintf(pysamerr,"\t\t");
+ // fprintf(pysam_stderr,"\t\t");
double Pkij = npresent==3 ? (double)2/(trio[itr]>>12) : 1; // with missing genotypes Pkij's are different
lk += log(1 - trio_Pm * (1 - Pkij));
- // fprintf(pysamerr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij);
+ // fprintf(pysam_stderr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij);
if ( c_lk < lk ) { c_lk = lk; c_itr = trio[itr]; }
if ( uc_itr==trio[itr] ) uc_is_mendelian = 1;
}
if ( !uc_is_mendelian )
{
uc_lk += log(1 - trio_Pm);
- // fprintf(pysamerr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
+ // fprintf(pysam_stderr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
if ( c_lk < uc_lk ) { c_lk = uc_lk; c_itr = uc_itr; }
}
- // fprintf(pysamerr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
+ // fprintf(pysam_stderr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
// Set genotypes for father, mother, child and calculate genotype qualities
for (i=0; i<3; i++)
int out_als, nout;
if ( nals > 8*sizeof(out_als) )
{
- fprintf(pysamerr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ fprintf(pysam_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
return 0;
}
nout = mcall_find_best_alleles(call, nals, &out_als);
{
if ( nout>4 )
{
- fprintf(pysamerr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
+ fprintf(pysam_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1);
return 0;
}
mcall_call_trio_genotypes(call, rec, nals,nout,out_als);
-/*
- Copyright (C) 2014 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
- ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
+ ploidy->sex2dflt[ploidy->nsex-1] = -1;
}
ss = se;
if ( !*se ) error("Could not parse: %s\n", line);
sp->ploidy = strtol(ss,&se,10);
if ( ss==se ) error("Could not parse: %s\n", line);
- if ( sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
- if ( sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
+ if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
+ if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
// Special case, chr="*" stands for a default value
if ( default_ploidy_def )
return 0;
}
+/*
+ *  Resolves the default ploidy. A default stored under the sex name "*"
+ *  takes precedence over the dflt argument. Sexes whose default is still
+ *  the -1 placeholder inherit the resolved value, and the min/max range
+ *  is widened so that it includes it.
+ */
+static void _set_defaults(ploidy_t *ploidy, int dflt)
+{
+    int star_id, isex;
+
+    // an explicit "*" entry overrides the caller-supplied default
+    if ( !khash_str2int_get(ploidy->sex2id, "*", &star_id) )
+        dflt = ploidy->sex2dflt[star_id];
+
+    // -1 marks sexes which were given no default of their own
+    for (isex = 0; isex < ploidy->nsex; isex++)
+    {
+        if ( ploidy->sex2dflt[isex] != -1 ) continue;
+        ploidy->sex2dflt[isex] = dflt;
+    }
+
+    ploidy->dflt = dflt;
+    if ( ploidy->min < 0 || ploidy->min > dflt ) ploidy->min = dflt;
+    if ( ploidy->max < 0 || ploidy->max < dflt ) ploidy->max = dflt;
+}
+
ploidy_t *ploidy_init(const char *fname, int dflt)
{
ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
if ( !pld ) return NULL;
- pld->dflt = pld->min = pld->max = dflt;
+ pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
if ( !pld->idx )
{
ploidy_destroy(pld);
- pld = NULL;
+ return NULL;
}
+ _set_defaults(pld,dflt);
return pld;
}
ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
if ( !pld ) return NULL;
- pld->dflt = pld->min = pld->max = dflt;
+ pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
regidx_insert(pld->idx,NULL);
free(tmp.s);
+ _set_defaults(pld,dflt);
return pld;
}
#include "pysam.h"
-/*
- Copyright (C) 2014 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
- ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
+ ploidy->sex2dflt[ploidy->nsex-1] = -1;
}
ss = se;
if ( !*se ) error("Could not parse: %s\n", line);
sp->ploidy = strtol(ss,&se,10);
if ( ss==se ) error("Could not parse: %s\n", line);
- if ( sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
- if ( sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
+ if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
+ if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
// Special case, chr="*" stands for a default value
if ( default_ploidy_def )
return 0;
}
+/*
+ *  Resolves the default ploidy. A default stored under the sex name "*"
+ *  takes precedence over the dflt argument. Sexes whose default is still
+ *  the -1 placeholder inherit the resolved value, and the min/max range
+ *  is widened so that it includes it.
+ */
+static void _set_defaults(ploidy_t *ploidy, int dflt)
+{
+    int star_id, isex;
+
+    // an explicit "*" entry overrides the caller-supplied default
+    if ( !khash_str2int_get(ploidy->sex2id, "*", &star_id) )
+        dflt = ploidy->sex2dflt[star_id];
+
+    // -1 marks sexes which were given no default of their own
+    for (isex = 0; isex < ploidy->nsex; isex++)
+    {
+        if ( ploidy->sex2dflt[isex] != -1 ) continue;
+        ploidy->sex2dflt[isex] = dflt;
+    }
+
+    ploidy->dflt = dflt;
+    if ( ploidy->min < 0 || ploidy->min > dflt ) ploidy->min = dflt;
+    if ( ploidy->max < 0 || ploidy->max < dflt ) ploidy->max = dflt;
+}
+
ploidy_t *ploidy_init(const char *fname, int dflt)
{
ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
if ( !pld ) return NULL;
- pld->dflt = pld->min = pld->max = dflt;
+ pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
if ( !pld->idx )
{
ploidy_destroy(pld);
- pld = NULL;
+ return NULL;
}
+ _set_defaults(pld,dflt);
return pld;
}
ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
if ( !pld ) return NULL;
- pld->dflt = pld->min = pld->max = dflt;
+ pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
regidx_insert(pld->idx,NULL);
free(tmp.s);
+ _set_defaults(pld,dflt);
return pld;
}
{
if (n1 == 0 || n1 >= b->n) return -1;
if (b->M != b->n * 2) {
- fprintf(pysamerr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
+ fprintf(pysam_stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
return -1;
}
b->n1 = n1;
void bcf_p1_dump_afs(bcf_p1aux_t *ma)
{
int k;
- fprintf(pysamerr, "[afs]");
+ fprintf(pysam_stderr, "[afs]");
for (k = 0; k <= ma->M; ++k)
- fprintf(pysamerr, " %d:%.3lf", k, ma->afs[ma->M - k]);
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, " %d:%.3lf", k, ma->afs[ma->M - k]);
+ fprintf(pysam_stderr, "\n");
memset(ma->afs, 0, sizeof(double) * (ma->M + 1));
}
#ifndef PYSAM_H
#define PYSAM_H
#include "stdio.h"
-extern FILE * pysamerr;
+extern FILE * pysam_stderr;
+extern FILE * pysam_stdout;
+extern const char * pysam_stdout_fn;
#endif
else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
else {
- fprintf(pysamerr, "The type '%s' not recognised\n", optarg);
+ fprintf(pysam_stderr, "The type '%s' not recognised\n", optarg);
return 1;
}
}
if (optind == argc) {
- fprintf(pysamerr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n");
- fprintf(pysamerr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n");
- fprintf(pysamerr, " -s INT column number for sequence names (suppressed by -p) [1]\n");
- fprintf(pysamerr, " -b INT column number for region start [4]\n");
- fprintf(pysamerr, " -e INT column number for region end (if no end, set INT to -b) [5]\n");
- fprintf(pysamerr, " -0 specify coordinates are zero-based\n");
- fprintf(pysamerr, " -S INT skip first INT lines [0]\n");
- fprintf(pysamerr, " -c CHAR skip lines starting with CHAR [null]\n");
- fprintf(pysamerr, " -a print all records\n");
- fprintf(pysamerr, " -f force to overwrite existing index\n");
- fprintf(pysamerr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n");
+ fprintf(pysam_stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n");
+ fprintf(pysam_stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n");
+ fprintf(pysam_stderr, " -b INT column number for region start [4]\n");
+ fprintf(pysam_stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n");
+ fprintf(pysam_stderr, " -0 specify coordinates are zero-based\n");
+ fprintf(pysam_stderr, " -S INT skip first INT lines [0]\n");
+ fprintf(pysam_stderr, " -c CHAR skip lines starting with CHAR [null]\n");
+ fprintf(pysam_stderr, " -a print all records\n");
+ fprintf(pysam_stderr, " -f force to overwrite existing index\n");
+ fprintf(pysam_stderr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
+ fprintf(pysam_stderr, "\n");
return 1;
}
if (is_all) { // read without random access
BGZF *fp;
s.l = s.m = 0; s.s = 0;
fp = bgzf_open(argv[optind], "r");
- while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s);
+        // Two separate statements: the operands of '&' may be evaluated in
+        // either order, which would allow the newline to precede the line text.
+        while (bgzf_getline(fp, '\n', &s) >= 0) { fputs(s.s, pysam_stdout); fputc('\n', pysam_stdout); }
bgzf_close(fp);
free(s.s);
} else if (optind + 2 > argc) { // create index
strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
if ((fp = fopen(fn, "rb")) != 0) {
fclose(fp);
- fprintf(pysamerr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
+ fprintf(pysam_stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
return 1;
}
}
if ( tbx_index_build(argv[optind], min_shift, &conf) )
{
- fprintf(pysamerr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n");
+ fprintf(pysam_stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n");
return 1;
}
} else { // read with random access
for (i = optind + 1; i < argc; ++i) {
hts_itr_t *itr;
if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue;
- while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s);
+                // Two separate statements: the operands of '&' may be evaluated in
+                // either order, which would allow the newline to precede the line text.
+                while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) { fputs(s.s, pysam_stdout); fputc('\n', pysam_stdout); }
tbx_itr_destroy(itr);
}
free(s.s);
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
- int argc, drop_header, tgts_is_vcf, mark_sites_logic;
+ int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic;
}
args_t;
return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
}
}
+/*
+ *  Returns the largest number of comma-separated values found in any of
+ *  the columns icol_beg .. icol_end-1. A column consisting of a lone "."
+ *  (missing value) counts as one value. Returns 0 for an empty range.
+ */
+static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
+{
+    int icol, nmax = 0;
+    for (icol = icol_beg; icol < icol_end; icol++)
+    {
+        const char *p = tab->cols[icol];
+        int n = 1;      // a field without commas still holds one value
+        if ( p[0]!='.' || p[1] )
+        {
+            // not the lone "." placeholder: one extra value per comma
+            for (; *p; p++)
+                if ( *p==',' ) n++;
+        }
+        if ( nmax < n ) nmax = n;
+    }
+    return nmax;
+}
+/*
+ *  Sets an integer FORMAT tag from a tab-delimited annotation line.
+ *
+ *  The values are read from nsmpl consecutive columns starting at
+ *  col->icol, one column per sample of args->hdr_out. Each column holds a
+ *  comma-separated list of integers, with "." standing for a missing value.
+ *  Samples with fewer values than the longest list are padded with
+ *  bcf_int32_vector_end.
+ *
+ *  Returns the return value of bcf_update_format_int32(). Calls error()
+ *  when a value cannot be parsed or is followed by anything other than
+ *  a comma or the end of the field.
+ */
+static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    annot_line_t *tab = (annot_line_t*) data;
+    int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+    assert( col->icol+nsmpl <= tab->ncols );
+    int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
+    assert( nvals>0 );
+    hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi);
+
+    int icol = col->icol, ismpl;
+    for (ismpl=0; ismpl<nsmpl; ismpl++)
+    {
+        int32_t *ptr = args->tmpi + ismpl*nvals;
+        int ival = 0;
+
+        char *str = tab->cols[icol];
+        while ( *str )
+        {
+            if ( str[0]=='.' && (!str[1] || str[1]==',') )  // missing value
+            {
+                ptr[ival++] = bcf_int32_missing;
+                str += str[1] ? 2 : 1;
+                continue;
+            }
+
+            char *end = str;
+            ptr[ival] = strtol(str, &end, 10);
+            // The buffer was sized by counting commas, so a value must be
+            // followed by a comma or the end of the field. Accepting any other
+            // separator (e.g. "1.5" or "1;2") would produce more values than
+            // were allocated and write past this sample's slot.
+            if ( end==str || (*end && *end!=',') )
+                error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[icol]);
+
+            ival++;
+            str = *end ? end+1 : end;
+        }
+        // pad shorter vectors so that every sample occupies nvals slots
+        while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
+        icol++;
+    }
+    return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals);
+}
+/*
+ *  Sets a floating-point FORMAT tag from a tab-delimited annotation line.
+ *
+ *  The values are read from nsmpl consecutive columns starting at
+ *  col->icol, one column per sample of args->hdr_out. Each column holds a
+ *  comma-separated list of numbers, with "." standing for a missing value.
+ *  Samples with fewer values than the longest list are padded with the
+ *  vector-end marker.
+ *
+ *  Returns the return value of bcf_update_format_float(). Calls error()
+ *  when a value cannot be parsed or is followed by anything other than
+ *  a comma or the end of the field.
+ */
+static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    annot_line_t *tab = (annot_line_t*) data;
+    int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+    assert( col->icol+nsmpl <= tab->ncols );
+    int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
+    assert( nvals>0 );
+    hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf);
+
+    int icol = col->icol, ismpl;
+    for (ismpl=0; ismpl<nsmpl; ismpl++)
+    {
+        float *ptr = args->tmpf + ismpl*nvals;
+        int ival = 0;
+
+        char *str = tab->cols[icol];
+        while ( *str )
+        {
+            if ( str[0]=='.' && (!str[1] || str[1]==',') )  // missing value
+            {
+                bcf_float_set_missing(ptr[ival]);
+                ival++;
+                str += str[1] ? 2 : 1;
+                continue;
+            }
+
+            char *end = str;
+            ptr[ival] = strtod(str, &end);
+            // The buffer was sized by counting commas, so a value must be
+            // followed by a comma or the end of the field. Accepting any other
+            // separator (e.g. "1;2") would produce more values than were
+            // allocated and write past this sample's slot.
+            if ( end==str || (*end && *end!=',') )
+                error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[icol]);
+
+            ival++;
+            str = *end ? end+1 : end;
+        }
+        // pad shorter vectors so that every sample occupies nvals slots
+        while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
+        icol++;
+    }
+    return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals);
+}
+/*
+ *  Sets a string FORMAT tag from a tab-delimited annotation line.
+ *
+ *  One column per sample of args->hdr_out is read, starting at col->icol.
+ *  The strings are packed into fixed-width slots as wide as the longest
+ *  of them; shorter strings are padded with zero bytes.
+ *
+ *  Returns the return value of bcf_update_format_char().
+ */
+static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    annot_line_t *tab = (annot_line_t*) data;
+    int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+    assert( col->icol+nsmpl <= tab->ncols );
+
+    // width of one per-sample slot = the longest string among the sample columns
+    int ismpl, max_len = 0;
+    for (ismpl=0; ismpl<nsmpl; ismpl++)
+    {
+        int len = strlen(tab->cols[col->icol + ismpl]);
+        if ( len > max_len ) max_len = len;
+    }
+    hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps);
+
+    for (ismpl=0; ismpl<nsmpl; ismpl++)
+    {
+        const char *src = tab->cols[col->icol + ismpl];
+        char *dst = args->tmps + ismpl*max_len;
+        int j;
+        for (j=0; src[j]; j++) dst[j] = src[j];
+        for (; j<max_len; j++) dst[j] = 0;      // zero-pad shorter strings
+    }
+    return bcf_update_format_char(args->hdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len);
+}
static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
kstring_t str = {0,0,0}, tmp = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
- int i = -1, has_fmt_str = 0, force_samples = -1;
+ int icol = -1, has_fmt_str = 0, force_samples = -1;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
- i++;
+ icol++;
str.l = 0;
kputsn(ss, se-ss, &str);
if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
- else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = i;
- else if ( !strcasecmp("POS",str.s) ) args->from_idx = i;
- else if ( !strcasecmp("FROM",str.s) ) args->from_idx = i;
- else if ( !strcasecmp("TO",str.s) ) args->to_idx = i;
- else if ( !strcasecmp("REF",str.s) ) args->ref_idx = i;
- else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = i;
+ else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol;
+ else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol;
+ else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol;
+ else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol;
+ else if ( !strcasecmp("REF",str.s) ) args->ref_idx = icol;
+ else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol;
else if ( !strcasecmp("ID",str.s) )
{
if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key = strdup(str.s);
if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
col->hdr_key = strdup(str.s);
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
col->hdr_key = strdup(str.s);
}
else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
{
- if ( !args->tgts_is_vcf )
- error("Error: FORMAT fields can be carried over from a VCF file only.\n");
-
char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;;
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
- tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
- bcf_hdr_sync(args->hdr_out);
+ if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ if ( args->tgts_is_vcf )
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ }
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
+ error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = -1;
+ if ( !args->tgts_is_vcf )
+ {
+ col->icol = icol;
+ icol += bcf_hdr_nsamples(args->hdr_out) - 1;
+ }
+ else
+ col->icol = -1;
col->replace = replace;
col->hdr_key = strdup(key);
if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
- case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
- case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
- case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
+ case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_format_int : setter_format_int; break;
+ case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_format_real : setter_format_real; break;
+ case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_format_str : setter_format_str; has_fmt_str = 1; break;
default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
}
}
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->hdr_key = strdup(str.s);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt);
if ( has_fmt_str )
{
- int n = bcf_hdr_nsamples(args->hdr_out) > bcf_hdr_nsamples(args->files->readers[1].header) ? bcf_hdr_nsamples(args->hdr_out) : bcf_hdr_nsamples(args->files->readers[1].header);
+ int n = bcf_hdr_nsamples(args->hdr_out);
+ if ( args->tgts_is_vcf && n<bcf_hdr_nsamples(args->files->readers[1].header) ) n = bcf_hdr_nsamples(args->files->readers[1].header);
args->tmpp = (char**)malloc(sizeof(char*)*n);
args->tmpp2 = (char**)malloc(sizeof(char*)*n);
}
- if ( force_samples>=0 )
+ if ( force_samples>=0 && args->tgts_is_vcf )
set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
}
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
- bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
if ( !args->drop_header )
{
if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
}
if ( args->ref_idx != -1 )
{
- assert( args->ref_idx < tmp->ncols );
- assert( args->alt_idx < tmp->ncols );
+ if ( args->ref_idx >= tmp->ncols )
+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s);
+ if ( args->alt_idx >= tmp->ncols )
+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s);
tmp->nals = 2;
hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
tmp->als[0] = tmp->cols[args->ref_idx];
fprintf(stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
- fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man pagee for details\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man pagee for details)\n");
+ fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
args->set_ids_replace = 1;
int regions_is_file = 0;
{"header-lines",required_argument,NULL,'h'},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
case 'h': args->header_fname = optarg; break;
case 1 : args->rename_chrs = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
- int argc, drop_header, tgts_is_vcf, mark_sites_logic;
+ int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic;
}
args_t;
int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s);
if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) )
{
- fprintf(pysamerr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
+ fprintf(pysam_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
args->nrm--;
}
else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) )
return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
}
}
+// Return the largest number of comma-separated fields found in any of the
+// columns icol_beg .. icol_end-1 of the annotation line.  A column without
+// commas (including the missing value ".") counts as one field.  Returns 0
+// only when the column range is empty.
+static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
+{
+    int widest = 0;
+    int icol;
+    for (icol=icol_beg; icol<icol_end; icol++)
+    {
+        const char *ptr = tab->cols[icol];
+
+        // one field plus one more for every comma seen
+        int nfields = 1;
+        for ( ; *ptr; ptr++)
+            if ( *ptr==',' ) nfields++;
+
+        if ( widest<nfields ) widest = nfields;
+    }
+    return widest;
+}
+// Set an integer FORMAT tag from a tab-delimited annotation line.  The line
+// must provide one column per output sample, starting at col->icol; each
+// column holds a comma-separated list of integers where "." marks a missing
+// value.  Samples with fewer values than the longest list are padded with
+// bcf_int32_vector_end.  Exits via error() on malformed input, otherwise
+// returns the value of bcf_update_format_int32().
+static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    annot_line_t *tab = (annot_line_t*) data;
+    int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+
+    // A short line is a user input problem and must be reported even when assert() is compiled out
+    if ( col->icol+nsmpl > tab->ncols )
+        error("Could not parse %s at %s:%d, expected %d+ columns, found %d\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,col->icol+nsmpl,tab->ncols);
+
+    int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
+    assert( nvals>0 );
+    hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi);
+
+    int icol = col->icol, ismpl;
+    for (ismpl=0; ismpl<nsmpl; ismpl++)
+    {
+        int32_t *ptr = args->tmpi + ismpl*nvals;
+        int ival = 0;
+
+        char *str = tab->cols[icol];
+        while ( *str )
+        {
+            // count_vals() sizes the buffer by counting commas only, whereas this loop steps
+            // over any character that follows a number (e.g. "1.5" or "1;2" yield two values).
+            // Never write past the nvals slots reserved for this sample.
+            if ( ival>=nvals )
+                error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[icol]);
+
+            if ( str[0]=='.' && (!str[1] || str[1]==',') )  // missing value
+            {
+                ptr[ival++] = bcf_int32_missing;
+                str += str[1] ? 2 : 1;
+                continue;
+            }
+
+            char *end = str;
+            ptr[ival] = strtol(str, &end, 10);
+            if ( end==str )
+                error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[icol]);
+
+            ival++;
+            str = *end ? end+1 : end;   // skip the separator, if any
+        }
+        while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
+        icol++;
+    }
+    return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals);
+}
+// Set a floating-point FORMAT tag from a tab-delimited annotation line.  The
+// line must provide one column per output sample, starting at col->icol; each
+// column holds a comma-separated list of numbers where "." marks a missing
+// value.  Samples with fewer values than the longest list are padded with the
+// vector-end marker.  Exits via error() on malformed input, otherwise returns
+// the value of bcf_update_format_float().
+static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    annot_line_t *tab = (annot_line_t*) data;
+    int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+
+    // A short line is a user input problem and must be reported even when assert() is compiled out
+    if ( col->icol+nsmpl > tab->ncols )
+        error("Could not parse %s at %s:%d, expected %d+ columns, found %d\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,col->icol+nsmpl,tab->ncols);
+
+    int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
+    assert( nvals>0 );
+    hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf);
+
+    int icol = col->icol, ismpl;
+    for (ismpl=0; ismpl<nsmpl; ismpl++)
+    {
+        float *ptr = args->tmpf + ismpl*nvals;
+        int ival = 0;
+
+        char *str = tab->cols[icol];
+        while ( *str )
+        {
+            // count_vals() sizes the buffer by counting commas only, whereas this loop steps
+            // over any character that follows a number (e.g. "1;2" yields two values).
+            // Never write past the nvals slots reserved for this sample.
+            if ( ival>=nvals )
+                error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[icol]);
+
+            if ( str[0]=='.' && (!str[1] || str[1]==',') )  // missing value
+            {
+                bcf_float_set_missing(ptr[ival]);
+                ival++;
+                str += str[1] ? 2 : 1;
+                continue;
+            }
+
+            char *end = str;
+            ptr[ival] = strtod(str, &end);
+            if ( end==str )
+                error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[icol]);
+
+            ival++;
+            str = *end ? end+1 : end;   // skip the separator, if any
+        }
+        while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
+        icol++;
+    }
+    return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals);
+}
+// Set a string FORMAT tag from a tab-delimited annotation line.  The line must
+// provide one column per output sample, starting at col->icol.  Every sample
+// gets a slot as wide as the longest string; shorter strings are padded with
+// zero bytes.  Exits via error() when the line has too few columns, otherwise
+// returns the value of bcf_update_format_char().
+static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    annot_line_t *tab = (annot_line_t*) data;
+    int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+
+    // A short line is a user input problem and must be reported even when assert() is compiled out
+    if ( col->icol+nsmpl > tab->ncols )
+        error("Could not parse %s at %s:%d, expected %d+ columns, found %d\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,col->icol+nsmpl,tab->ncols);
+
+    // The slot width is set by the longest per-sample string
+    int i, max_len = 0;
+    for (i=col->icol; i<col->icol+nsmpl; i++)
+    {
+        int len = strlen(tab->cols[i]);
+        if ( max_len < len ) max_len = len;
+    }
+    hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps);
+
+    int icol = col->icol, ismpl;
+    for (ismpl=0; ismpl<nsmpl; ismpl++)
+    {
+        char *ptr = args->tmps + ismpl*max_len;
+        char *str = tab->cols[icol];
+
+        // copy the string, then zero-fill the rest of the slot
+        for (i=0; str[i]; i++) ptr[i] = str[i];
+        while ( i<max_len ) ptr[i++] = 0;
+        icol++;
+    }
+    return bcf_update_format_char(args->hdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len);
+}
static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
return; // the same samples in both files
if ( !nmatch ) error("No matching samples found in the source and the destination file\n");
- if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(pysamerr,"%d sample(s) in common\n", nmatch);
+ if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(pysam_stderr,"%d sample(s) in common\n", nmatch);
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
kstring_t str = {0,0,0}, tmp = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
- int i = -1, has_fmt_str = 0, force_samples = -1;
+ int icol = -1, has_fmt_str = 0, force_samples = -1;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
- i++;
+ icol++;
str.l = 0;
kputsn(ss, se-ss, &str);
if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
- else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = i;
- else if ( !strcasecmp("POS",str.s) ) args->from_idx = i;
- else if ( !strcasecmp("FROM",str.s) ) args->from_idx = i;
- else if ( !strcasecmp("TO",str.s) ) args->to_idx = i;
- else if ( !strcasecmp("REF",str.s) ) args->ref_idx = i;
- else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = i;
+ else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol;
+ else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol;
+ else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol;
+ else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol;
+ else if ( !strcasecmp("REF",str.s) ) args->ref_idx = icol;
+ else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol;
else if ( !strcasecmp("ID",str.s) )
{
if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key = strdup(str.s);
if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
col->hdr_key = strdup(str.s);
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
col->hdr_key = strdup(str.s);
}
else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
{
- if ( !args->tgts_is_vcf )
- error("Error: FORMAT fields can be carried over from a VCF file only.\n");
-
char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;;
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
- tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
- bcf_hdr_sync(args->hdr_out);
+ if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ if ( args->tgts_is_vcf )
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ }
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
+ error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = -1;
+ if ( !args->tgts_is_vcf )
+ {
+ col->icol = icol;
+ icol += bcf_hdr_nsamples(args->hdr_out) - 1;
+ }
+ else
+ col->icol = -1;
col->replace = replace;
col->hdr_key = strdup(key);
if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
- case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
- case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
- case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
+ case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_format_int : setter_format_int; break;
+ case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_format_real : setter_format_real; break;
+ case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_format_str : setter_format_str; has_fmt_str = 1; break;
default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
}
}
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->hdr_key = strdup(str.s);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt);
if ( has_fmt_str )
{
- int n = bcf_hdr_nsamples(args->hdr_out) > bcf_hdr_nsamples(args->files->readers[1].header) ? bcf_hdr_nsamples(args->hdr_out) : bcf_hdr_nsamples(args->files->readers[1].header);
+ int n = bcf_hdr_nsamples(args->hdr_out);
+ if ( args->tgts_is_vcf && n<bcf_hdr_nsamples(args->files->readers[1].header) ) n = bcf_hdr_nsamples(args->files->readers[1].header);
args->tmpp = (char**)malloc(sizeof(char*)*n);
args->tmpp2 = (char**)malloc(sizeof(char*)*n);
}
- if ( force_samples>=0 )
+ if ( force_samples>=0 && args->tgts_is_vcf )
set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
}
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
- bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
if ( !args->drop_header )
{
if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
}
if ( args->ref_idx != -1 )
{
- assert( args->ref_idx < tmp->ncols );
- assert( args->alt_idx < tmp->ncols );
+ if ( args->ref_idx >= tmp->ncols )
+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s);
+ if ( args->alt_idx >= tmp->ncols )
+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s);
tmp->nals = 2;
hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
tmp->als[0] = tmp->cols[args->ref_idx];
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Annotate and edit VCF/BCF files.\n");
- fprintf(pysamerr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
- fprintf(pysamerr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
- fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(pysamerr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
- fprintf(pysamerr, " -I, --set-id [+]<format> set ID column, see man pagee for details\n");
- fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true (see man pagee for details)\n");
- fprintf(pysamerr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
- fprintf(pysamerr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(pysamerr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(pysamerr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Annotate and edit VCF/BCF files.\n");
+ fprintf(pysam_stderr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(pysam_stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
+ fprintf(pysam_stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
+ fprintf(pysam_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
+ fprintf(pysam_stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(pysam_stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(pysam_stderr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
args->set_ids_replace = 1;
int regions_is_file = 0;
{"header-lines",required_argument,NULL,'h'},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
case 'h': args->header_fname = optarg; break;
case 1 : args->rename_chrs = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
typedef struct
{
int flag; // combination of CF_* flags above
- int output_type, n_threads;
+ int output_type, n_threads, record_cmd_line;
htsFile *bcf_in, *out_fh;
char *bcf_fname, *output_fname;
char **samples; // for subsampling and ploidy
"* * * M 1\n"
"* * * F 0\n"
},
+ { .alias = "1",
+ .about = "Treat all samples as haploid",
+ .ploidy =
+ "* * * * 1\n"
+ },
{
.alias = NULL,
.about = NULL,
if ( args->regions )
{
if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
- error("Failed to read the targets: %s\n", args->regions);
+ error("Failed to read the regions: %s\n", args->regions);
}
if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
}
+ }
+ if ( args->ploidy )
+ {
args->nsex = ploidy_nsex(args->ploidy);
args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
+ if ( !args->nsamples )
+ {
+ args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
+ args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
+ for (i=0; i<args->nsamples; i++) args->sample2sex[i] = 0;
+ }
+ }
+ if ( args->nsamples )
+ {
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
else
{
args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
- for (i=0; i<args->nsamples; i++)
- if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
- error("No such sample: %s\n", args->samples[i]);
+ if ( args->samples )
+ {
+ for (i=0; i<args->nsamples; i++)
+ if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
+ error("No such sample: %s\n", args->samples[i]);
+ }
}
args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");
- bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
bcf_hdr_write(args->out_fh, args->aux.hdr);
if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
int i;
- for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ if ( args->samples )
+ {
+ for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ }
if ( args->aux.fams )
{
for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name);
fprintf(stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "File format options:\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
args.output_fname = "-";
args.output_type = FT_VCF;
args.n_threads = 0;
+ args.record_cmd_line = 1;
args.aux.trio_Pm_SNPs = 1 - 1e-8;
args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
{"ploidy-file",required_argument,NULL,2},
{"chromosome-X",no_argument,NULL,'X'},
{"chromosome-Y",no_argument,NULL,'Y'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
case 's': args.samples_fname = optarg; break;
case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
case 9 : args.n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args.record_cmd_line = 0; break;
default: usage(&args);
}
}
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
typedef struct
{
int flag; // combination of CF_* flags above
- int output_type, n_threads;
+ int output_type, n_threads, record_cmd_line;
htsFile *bcf_in, *out_fh;
char *bcf_fname, *output_fname;
char **samples; // for subsampling and ploidy
"* * * M 1\n"
"* * * F 0\n"
},
+ { .alias = "1",
+ .about = "Treat all samples as haploid",
+ .ploidy =
+ "* * * * 1\n"
+ },
{
.alias = NULL,
.about = NULL,
char x = *se, *xptr = se; *se = 0;
int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
- if ( ismpl < 0 ) { fprintf(pysamerr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+ if ( ismpl < 0 ) { fprintf(pysam_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
ss = se+1;
while ( *ss && isspace(*ss) ) ss++;
if ( args->regions )
{
if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
- error("Failed to read the targets: %s\n", args->regions);
+ error("Failed to read the regions: %s\n", args->regions);
}
if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
if ( args->aux.flag&CALL_CONSTR_TRIO )
{
if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
- fprintf(pysamerr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
+ fprintf(pysam_stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
}
+ }
+ if ( args->ploidy )
+ {
args->nsex = ploidy_nsex(args->ploidy);
args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
+ if ( !args->nsamples )
+ {
+ args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
+ args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
+ for (i=0; i<args->nsamples; i++) args->sample2sex[i] = 0;
+ }
+ }
+ if ( args->nsamples )
+ {
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
else
{
args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
- for (i=0; i<args->nsamples; i++)
- if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
- error("No such sample: %s\n", args->samples[i]);
+ if ( args->samples )
+ {
+ for (i=0; i<args->nsamples; i++)
+ if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
+ error("No such sample: %s\n", args->samples[i]);
+ }
}
args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");
- bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
bcf_hdr_write(args->out_fh, args->aux.hdr);
if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
int i;
- for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ if ( args->samples )
+ {
+ for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ }
if ( args->aux.fams )
{
for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name);
else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP;
else
{
- fprintf(pysamerr,"Could not parse \"%s\"\n", str);
+ fprintf(pysam_stderr,"Could not parse \"%s\"\n", str);
exit(1);
}
if ( !*se ) break;
if ( !pld->alias )
{
- fprintf(pysamerr,"Predefined ploidies:\n");
+ fprintf(pysam_stderr,"Predefined ploidies:\n");
pld = ploidy_predefs;
while ( pld->alias )
{
- fprintf(pysamerr,"%s\n .. %s\n\n", pld->alias,pld->about);
+ fprintf(pysam_stderr,"%s\n .. %s\n\n", pld->alias,pld->about);
if ( detailed )
- fprintf(pysamerr,"%s\n", pld->ploidy);
+ fprintf(pysam_stderr,"%s\n", pld->ploidy);
pld++;
}
- fprintf(pysamerr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
- fprintf(pysamerr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
- fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
+ fprintf(pysam_stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
+ fprintf(pysam_stderr,"\n");
exit(-1);
}
else if ( detailed )
{
- fprintf(pysamerr,"%s", pld->ploidy);
+ fprintf(pysam_stderr,"%s", pld->ploidy);
exit(-1);
}
return ploidy_init_string(pld->ploidy,2);
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n");
- fprintf(pysamerr, " This command replaces the former \"bcftools view\" caller. Some of the original\n");
- fprintf(pysamerr, " functionality has been temporarily lost in the process of transition to htslib,\n");
- fprintf(pysamerr, " but will be added back on popular demand. The original calling model can be\n");
- fprintf(pysamerr, " invoked with the -c option.\n");
- fprintf(pysamerr, "Usage: bcftools call [options] <in.vcf.gz>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "File format options:\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(pysamerr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
- fprintf(pysamerr, " --ploidy-file <file> space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --samples <list> list of samples to include [all samples]\n");
- fprintf(pysamerr, " -S, --samples-file <file> PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Input/output options:\n");
- fprintf(pysamerr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
- fprintf(pysamerr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
- fprintf(pysamerr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
- fprintf(pysamerr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
- fprintf(pysamerr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
- fprintf(pysamerr, " -V, --skip-variants <type> skip indels/snps\n");
- fprintf(pysamerr, " -v, --variants-only output variant sites only\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Consensus/variant calling options:\n");
- fprintf(pysamerr, " -c, --consensus-caller the original calling method (conflicts with -m)\n");
- fprintf(pysamerr, " -C, --constrain <str> one of: alleles, trio (see manual)\n");
- fprintf(pysamerr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
- fprintf(pysamerr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
- fprintf(pysamerr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(pysamerr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n");
+ fprintf(pysam_stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n");
+ fprintf(pysam_stderr, " functionality has been temporarily lost in the process of transition to htslib,\n");
+ fprintf(pysam_stderr, " but will be added back on popular demand. The original calling model can be\n");
+ fprintf(pysam_stderr, " invoked with the -c option.\n");
+ fprintf(pysam_stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "File format options:\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
+ fprintf(pysam_stderr, " --ploidy-file <file> space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --samples <list> list of samples to include [all samples]\n");
+ fprintf(pysam_stderr, " -S, --samples-file <file> PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Input/output options:\n");
+ fprintf(pysam_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
+ fprintf(pysam_stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
+ fprintf(pysam_stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+ fprintf(pysam_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
+ fprintf(pysam_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
+ fprintf(pysam_stderr, " -V, --skip-variants <type> skip indels/snps\n");
+ fprintf(pysam_stderr, " -v, --variants-only output variant sites only\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Consensus/variant calling options:\n");
+ fprintf(pysam_stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n");
+ fprintf(pysam_stderr, " -C, --constrain <str> one of: alleles, trio (see manual)\n");
+ fprintf(pysam_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+ fprintf(pysam_stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+ fprintf(pysam_stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
+ fprintf(pysam_stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
// todo (and more)
- // fprintf(pysamerr, "\nContrast calling and association test options:\n");
- // fprintf(pysamerr, " -1 INT number of group-1 samples [0]\n");
- // fprintf(pysamerr, " -C FLOAT posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
- // fprintf(pysamerr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
- // fprintf(pysamerr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
- fprintf(pysamerr, "\n");
+ // fprintf(pysam_stderr, "\nContrast calling and association test options:\n");
+ // fprintf(pysam_stderr, " -1 INT number of group-1 samples [0]\n");
+ // fprintf(pysam_stderr, " -C FLOAT posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
+ // fprintf(pysam_stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
+ // fprintf(pysam_stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
+ fprintf(pysam_stderr, "\n");
exit(-1);
}
args.output_fname = "-";
args.output_type = FT_VCF;
args.n_threads = 0;
+ args.record_cmd_line = 1;
args.aux.trio_Pm_SNPs = 1 - 1e-8;
args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
{"ploidy-file",required_argument,NULL,2},
{"chromosome-X",no_argument,NULL,'X'},
{"chromosome-Y",no_argument,NULL,'Y'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
{
case 2 : ploidy_fname = optarg; break;
case 1 : ploidy = optarg; break;
- case 'X': ploidy = "X"; fprintf(pysamerr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
- case 'Y': ploidy = "Y"; fprintf(pysamerr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
+ case 'X': ploidy = "X"; fprintf(pysam_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
+ case 'Y': ploidy = "Y"; fprintf(pysam_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N
case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default)
case 's': args.samples_fname = optarg; break;
case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
case 9 : args.n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args.record_cmd_line = 0; break;
default: usage(&args);
}
}
if ( !ploidy_fname && !ploidy )
{
- fprintf(pysamerr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
+ fprintf(pysam_stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
args.ploidy = ploidy_init_string("",2);
}
args->hmm = hmm_init(args->nstates, args->tprob, 10000);
hmm_init_states(args->hmm, args->iprobs);
- args->summary_fh = stdout;
+ args->summary_fh = pysam_stdout;
if ( args->output_dir )
{
init_sample_files(&args->query_sample, args->output_dir);
char *cmd = msprintf("python %s -p %f", script, th);
int ret = system(cmd);
- if ( ret) fprintf(pysamerr, "The command returned non-zero status %d: %s\n", ret, cmd);
+ if ( ret) fprintf(pysam_stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
free(cmd);
}
cn3_baf /= norm;
#if DBG0
- if ( args->verbose ) fprintf(pysamerr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf);
+ if ( args->verbose ) fprintf(pysam_stderr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf);
#endif
double cn1_lrr = exp(-(lrr + 0.45)*(lrr + 0.45)/smpl->lrr_dev2);
baf_AA_dev2 /= norm_baf_AA_dev2;
if ( baf_dev2 < baf_AA_dev2 ) baf_dev2 = baf_AA_dev2;
double max_mean_cn3 = 0.5 - sqrt(baf_dev2)*1.644854; // R: qnorm(0.95)=1.644854
- //fprintf(pysamerr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3);
+ //fprintf(pysam_stderr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3);
assert( max_mean_cn3>0 );
double new_frac = 1./mean_cn3 - 2;
if ( args->optimize_frac )
{
int niter = 0;
- fprintf(pysamerr,"Attempting to estimate the fraction of aberrant cells (chr %s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid));
+ fprintf(pysam_stderr,"Attempting to estimate the fraction of aberrant cells (chr %s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid));
do
{
- fprintf(pysamerr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
+ fprintf(pysam_stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
if ( args->control_sample.name )
- fprintf(pysamerr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
- fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
+ fprintf(pysam_stderr,"\n");
set_emission_probs(args);
hmm_run_fwd_bwd(hmm, args->nsites, args->eprob, args->sites);
}
if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample);
}
- fprintf(pysamerr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
+ fprintf(pysam_stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2);
if ( args->control_sample.name )
- fprintf(pysamerr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
- fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
+ fprintf(pysam_stderr,"\n");
}
set_emission_probs(args);
double ori_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm));
hmm_run_baum_welch(hmm, args->nsites, args->eprob, args->sites);
double new_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm));
- fprintf(pysamerr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii);
+ fprintf(pysam_stderr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii);
double *tprob = init_tprob_matrix(nstates, 1-new_ii, args->same_prob);
hmm_set_tprob(args->hmm, tprob, 10000);
double *tprob_arr = hmm_get_tprob(hmm);
{
for (j=0; j<nstates; j++)
{
- printf(" %.15f", MAT(tprob_arr,nstates,j,i));
+ fprintf(pysam_stdout, " %.15f", MAT(tprob_arr,nstates,j,i));
}
- printf("\n");
+ fprintf(pysam_stdout, "\n");
}
break;
}
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n");
- fprintf(pysamerr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n");
- fprintf(pysamerr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n");
- fprintf(pysamerr, "Usage: bcftools cnv [OPTIONS] <file.vcf>\n");
- fprintf(pysamerr, "General Options:\n");
- fprintf(pysamerr, " -c, --control-sample <string> optional control sample name to highlight differences\n");
- fprintf(pysamerr, " -f, --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(pysamerr, " -o, --output-dir <path> \n");
- fprintf(pysamerr, " -p, --plot-threshold <float> plot aberrant chromosomes with quality at least 'float'\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --query-sample <string> query samply name\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, "HMM Options:\n");
- fprintf(pysamerr, " -a, --aberrant <float[,float]> fraction of aberrant cells in query and control [1.0,1.0]\n");
- fprintf(pysamerr, " -b, --BAF-weight <float> relative contribution from BAF [1]\n");
- fprintf(pysamerr, " -d, --BAF-dev <float[,float]> expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
- fprintf(pysamerr, " -e, --err-prob <float> uniform error probability [1e-4]\n");
- fprintf(pysamerr, " -k, --LRR-dev <float[,float]> expected LRR deviation [0.2,0.2]\n"); // experimental
- fprintf(pysamerr, " -l, --LRR-weight <float> relative contribution from LRR [0.2]\n");
- fprintf(pysamerr, " -L, --LRR-smooth-win <int> window of LRR moving average smoothing [10]\n");
- fprintf(pysamerr, " -O, --optimize <float> estimate fraction of aberrant cells down to <float> [1.0]\n");
- fprintf(pysamerr, " -P, --same-prob <float> prior probability of -s/-c being the same [0.5]\n");
- fprintf(pysamerr, " -x, --xy-prob <float> P(x|y) transition probability [1e-9]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n");
+ fprintf(pysam_stderr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n");
+ fprintf(pysam_stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n");
+ fprintf(pysam_stderr, "Usage: bcftools cnv [OPTIONS] <file.vcf>\n");
+ fprintf(pysam_stderr, "General Options:\n");
+ fprintf(pysam_stderr, " -c, --control-sample <string> optional control sample name to highlight differences\n");
+ fprintf(pysam_stderr, " -f, --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(pysam_stderr, " -o, --output-dir <path> \n");
+ fprintf(pysam_stderr, " -p, --plot-threshold <float> plot aberrant chromosomes with quality at least 'float'\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --query-sample <string> query samply name\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, "HMM Options:\n");
+ fprintf(pysam_stderr, " -a, --aberrant <float[,float]> fraction of aberrant cells in query and control [1.0,1.0]\n");
+ fprintf(pysam_stderr, " -b, --BAF-weight <float> relative contribution from BAF [1]\n");
+ fprintf(pysam_stderr, " -d, --BAF-dev <float[,float]> expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
+ fprintf(pysam_stderr, " -e, --err-prob <float> uniform error probability [1e-4]\n");
+ fprintf(pysam_stderr, " -k, --LRR-dev <float[,float]> expected LRR deviation [0.2,0.2]\n"); // experimental
+ fprintf(pysam_stderr, " -l, --LRR-weight <float> relative contribution from LRR [0.2]\n");
+ fprintf(pysam_stderr, " -L, --LRR-smooth-win <int> window of LRR moving average smoothing [10]\n");
+ fprintf(pysam_stderr, " -O, --optimize <float> estimate fraction of aberrant cells down to <float> [1.0]\n");
+ fprintf(pysam_stderr, " -P, --same-prob <float> prior probability of -s/-c being the same [0.5]\n");
+ fprintf(pysam_stderr, " -x, --xy-prob <float> P(x|y) transition probability [1e-9]\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
}
cnv_next_line(args, NULL);
create_plots(args);
- fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
destroy_data(args);
free(args);
return 0;
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <htslib/tbx.h> // for hts_get_bgzfp()
#include "bcftools.h"
typedef struct _args_t
{
bcf_srs_t *files;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, record_cmd_line;
bcf_hdr_t *out_hdr;
int *seen_seq;
char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
- int compact_PS, phase_set_changed;
+ int compact_PS, phase_set_changed, naive_concat;
}
args_t;
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
}
- bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
for (i=0; i<args->nfnames; i++) free(args->fnames[i]);
free(args->fnames);
if ( args->files ) bcf_sr_destroy(args->files);
- if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
- bcf_hdr_destroy(args->out_hdr);
+ if ( args->out_fh )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+ }
+ if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr);
free(args->seen_seq);
free(args->start_pos);
free(args->swap_phase);
}
}
+/*
+ *  naive_concat() - concatenate BGZF-compressed BCF files without recompressing.
+ *
+ *  For every file in args->fnames the BCF magic string and header text are read
+ *  through the BGZF layer; only the header of the first file is written to
+ *  args->output_fname, the headers of the remaining files are read and dropped
+ *  without being compared.  Any record data that shared the BGZF block with the
+ *  end of the header is re-written through bgzf_write(), the remainder of the
+ *  file is copied as raw compressed bytes.  The trailing 28-byte BGZF EOF block
+ *  of each input is not copied; bgzf_close() on the output handle is relied on
+ *  to terminate the output.
+ *
+ *  All failures are reported through error(), which does not return.
+ */
+static void naive_concat(args_t *args)
+{
+    // only compressed BCF atm
+    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
+    if ( !bgzf_out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));
+
+    const size_t page_size = 32768;
+    char *buf = (char*) malloc(page_size);
+    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
+    kstring_t tmp = {0,0,0};
+    int i;
+    for (i=0; i<args->nfnames; i++)
+    {
+        htsFile *hts_fp = hts_open(args->fnames[i],"r");
+        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
+        htsFormat type = *hts_get_format(hts_fp);
+
+        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+
+        BGZF *fp = hts_get_bgzfp(hts_fp);
+        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
+            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));
+
+        uint8_t magic[5];
+        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+        if ( memcmp(magic, "BCF\2\2", 5) != 0 ) error("Invalid BCF magic string in %s\n", args->fnames[i]);
+
+        // The header length is a 4-byte little-endian integer on disk.  Decode it
+        // byte by byte instead of reading it into the (wider, host-endian) size_t.
+        uint8_t raw_len[4];
+        if ( bgzf_read(fp, raw_len, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+        tmp.l = (size_t)raw_len[0] | (size_t)raw_len[1]<<8 | (size_t)raw_len[2]<<16 | (size_t)raw_len[3]<<24;
+        hts_expand(char,tmp.l,tmp.m,tmp.s);
+        if ( bgzf_read(fp, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+
+        // write only the first header
+        if ( i==0 )
+        {
+            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
+            if ( bgzf_write(bgzf_out, raw_len, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
+            if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
+        }
+
+        // Output all non-header data that were read together with the header block
+        int nskip = fp->block_offset;
+        if ( fp->block_length - nskip > 0 )
+        {
+            // the write goes to the output handle, so that is the handle whose error code matters
+            if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",bgzf_out->errcode);
+        }
+        // close the current output block so that the raw bytes below start on a block boundary
+        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
+
+
+        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks.
+        // The last `neof` bytes seen so far are always held back in `cached`, because a
+        // page_size boundary may fall in the middle of the EOF block.
+        enum { neof = 28 };
+        char cached[neof];
+        ssize_t nread, nwr, ncached = 0;
+        while ( (nread = bgzf_raw_read(fp, buf, page_size)) > 0 )
+        {
+            if ( nread >= neof )
+            {
+                // this chunk alone refills the cache, everything cached so far is ordinary data
+                if ( ncached )
+                {
+                    nwr = bgzf_raw_write(bgzf_out, cached, ncached);
+                    if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
+                }
+                ssize_t nbody = nread - neof;
+                if ( nbody )
+                {
+                    nwr = bgzf_raw_write(bgzf_out, buf, nbody);
+                    if (nwr != nbody) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nbody);
+                }
+                memcpy(cached, buf+nbody, neof);
+                ncached = neof;
+                continue;
+            }
+
+            // short chunk: flush only the oldest cached bytes that no longer fit, then append
+            ssize_t nflush = ncached + nread - neof;
+            if ( nflush > 0 )
+            {
+                nwr = bgzf_raw_write(bgzf_out, cached, nflush);
+                if (nwr != nflush) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nflush);
+                memmove(cached, cached+nflush, ncached-nflush);
+                ncached -= nflush;
+            }
+            memcpy(cached+ncached, buf, nread);
+            ncached += nread;
+        }
+        if ( nread < 0 ) error("Failed to read %s\n", args->fnames[i]);
+
+        // Whatever is left in the cache is the tail of the file: drop it only if it is
+        // exactly the BGZF EOF block, otherwise it is data and must be written out.
+        int is_eof = ncached==neof && !memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof);
+        if ( ncached && !is_eof )
+        {
+            nwr = bgzf_raw_write(bgzf_out, cached, ncached);
+            if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
+        }
+        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
+    }
+    free(buf);
+    free(tmp.s);
+    // the handle must not be dereferenced once bgzf_close() has been called on it
+    if (bgzf_close(bgzf_out) < 0) error("Failed to close %s\n", args->output_fname);
+}
+
+
static void usage(args_t *args)
{
fprintf(stderr, "\n");
fprintf(stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n");
fprintf(stderr, " VCF into one. The input files must be sorted by chr and position. The files\n");
fprintf(stderr, " must be given in the correct order to produce sorted VCF on output unless\n");
- fprintf(stderr, " the -a, --allow-overlaps option is specified.\n");
+ fprintf(stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n");
+ fprintf(stderr, " are concatenated without being recompressed, which is very fast but dangerous\n");
+ fprintf(stderr, " if the BCF headers differ.\n");
fprintf(stderr, "Usage: bcftools concat [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n");
fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
fprintf(stderr, " -o, --output <file> Write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->min_PQ = 30;
static struct option loptions[] =
{
+ {"naive",no_argument,NULL,'n'},
{"compact-PS",no_argument,NULL,'c'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"threads",required_argument,NULL,9},
{"file-list",required_argument,NULL,'f'},
{"min-PQ",required_argument,NULL,'q'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0)
{
switch (c) {
case 'c': args->compact_PS = 1; break;
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
break;
+ case 'n': args->naive_concat = 1; break;
case 'a': args->allow_overlaps = 1; break;
case 'l': args->phased_concat = 1; break;
case 'f': args->file_list = optarg; break;
};
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
if ( !args->nfnames ) usage(args);
if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n");
if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
+ if ( args->naive_concat )
+ {
+ if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
+ if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+ naive_concat(args);
+ destroy_data(args);
+ free(args);
+ return 0;
+ }
init_data(args);
concat(args);
destroy_data(args);
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <htslib/tbx.h> // for hts_get_bgzfp()
#include "bcftools.h"
typedef struct _args_t
{
bcf_srs_t *files;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, record_cmd_line;
bcf_hdr_t *out_hdr;
int *seen_seq;
char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
- int compact_PS, phase_set_changed;
+ int compact_PS, phase_set_changed, naive_concat;
}
args_t;
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
}
- bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
for (i=0; i<args->nfnames; i++) free(args->fnames[i]);
free(args->fnames);
if ( args->files ) bcf_sr_destroy(args->files);
- if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
- bcf_hdr_destroy(args->out_hdr);
+ if ( args->out_fh )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+ }
+ if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr);
free(args->seen_seq);
free(args->start_pos);
free(args->swap_phase);
{
if ( !gt_absent_warned )
{
- fprintf(pysamerr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
+ fprintf(pysam_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
gt_absent_warned = 1;
}
continue;
{
if ( !gt_absent_warned )
{
- fprintf(pysamerr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
+ fprintf(pysam_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
gt_absent_warned = 1;
}
continue;
}
}
+/*
+ *  naive_concat() - concatenate BGZF-compressed BCF files without recompressing.
+ *
+ *  For every file in args->fnames the BCF magic string and header text are read
+ *  through the BGZF layer; only the header of the first file is written to
+ *  args->output_fname, the headers of the remaining files are read and dropped
+ *  without being compared.  Any record data that shared the BGZF block with the
+ *  end of the header is re-written through bgzf_write(), the remainder of the
+ *  file is copied as raw compressed bytes.  The trailing 28-byte BGZF EOF block
+ *  of each input is not copied; bgzf_close() on the output handle is relied on
+ *  to terminate the output.
+ *
+ *  All failures are reported through error(), which does not return.
+ */
+static void naive_concat(args_t *args)
+{
+    // only compressed BCF atm
+    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
+    if ( !bgzf_out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));
+
+    const size_t page_size = 32768;
+    char *buf = (char*) malloc(page_size);
+    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
+    kstring_t tmp = {0,0,0};
+    int i;
+    for (i=0; i<args->nfnames; i++)
+    {
+        htsFile *hts_fp = hts_open(args->fnames[i],"r");
+        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
+        htsFormat type = *hts_get_format(hts_fp);
+
+        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+
+        BGZF *fp = hts_get_bgzfp(hts_fp);
+        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
+            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));
+
+        uint8_t magic[5];
+        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+        if ( memcmp(magic, "BCF\2\2", 5) != 0 ) error("Invalid BCF magic string in %s\n", args->fnames[i]);
+
+        // The header length is a 4-byte little-endian integer on disk.  Decode it
+        // byte by byte instead of reading it into the (wider, host-endian) size_t.
+        uint8_t raw_len[4];
+        if ( bgzf_read(fp, raw_len, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+        tmp.l = (size_t)raw_len[0] | (size_t)raw_len[1]<<8 | (size_t)raw_len[2]<<16 | (size_t)raw_len[3]<<24;
+        hts_expand(char,tmp.l,tmp.m,tmp.s);
+        if ( bgzf_read(fp, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+
+        // write only the first header
+        if ( i==0 )
+        {
+            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
+            if ( bgzf_write(bgzf_out, raw_len, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
+            if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
+        }
+
+        // Output all non-header data that were read together with the header block
+        int nskip = fp->block_offset;
+        if ( fp->block_length - nskip > 0 )
+        {
+            // the write goes to the output handle, so that is the handle whose error code matters
+            if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",bgzf_out->errcode);
+        }
+        // close the current output block so that the raw bytes below start on a block boundary
+        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
+
+
+        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks.
+        // The last `neof` bytes seen so far are always held back in `cached`, because a
+        // page_size boundary may fall in the middle of the EOF block.
+        enum { neof = 28 };
+        char cached[neof];
+        ssize_t nread, nwr, ncached = 0;
+        while ( (nread = bgzf_raw_read(fp, buf, page_size)) > 0 )
+        {
+            if ( nread >= neof )
+            {
+                // this chunk alone refills the cache, everything cached so far is ordinary data
+                if ( ncached )
+                {
+                    nwr = bgzf_raw_write(bgzf_out, cached, ncached);
+                    if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
+                }
+                ssize_t nbody = nread - neof;
+                if ( nbody )
+                {
+                    nwr = bgzf_raw_write(bgzf_out, buf, nbody);
+                    if (nwr != nbody) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nbody);
+                }
+                memcpy(cached, buf+nbody, neof);
+                ncached = neof;
+                continue;
+            }
+
+            // short chunk: flush only the oldest cached bytes that no longer fit, then append
+            ssize_t nflush = ncached + nread - neof;
+            if ( nflush > 0 )
+            {
+                nwr = bgzf_raw_write(bgzf_out, cached, nflush);
+                if (nwr != nflush) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nflush);
+                memmove(cached, cached+nflush, ncached-nflush);
+                ncached -= nflush;
+            }
+            memcpy(cached+ncached, buf, nread);
+            ncached += nread;
+        }
+        if ( nread < 0 ) error("Failed to read %s\n", args->fnames[i]);
+
+        // Whatever is left in the cache is the tail of the file: drop it only if it is
+        // exactly the BGZF EOF block, otherwise it is data and must be written out.
+        int is_eof = ncached==neof && !memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof);
+        if ( ncached && !is_eof )
+        {
+            nwr = bgzf_raw_write(bgzf_out, cached, ncached);
+            if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
+        }
+        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
+    }
+    free(buf);
+    free(tmp.s);
+    // the handle must not be dereferenced once bgzf_close() has been called on it
+    if (bgzf_close(bgzf_out) < 0) error("Failed to close %s\n", args->output_fname);
+}
+
+
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n");
- fprintf(pysamerr, " columns appearing in the same order. The program can be used, for example, to\n");
- fprintf(pysamerr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n");
- fprintf(pysamerr, " VCF into one. The input files must be sorted by chr and position. The files\n");
- fprintf(pysamerr, " must be given in the correct order to produce sorted VCF on output unless\n");
- fprintf(pysamerr, " the -a, --allow-overlaps option is specified.\n");
- fprintf(pysamerr, "Usage: bcftools concat [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
- fprintf(pysamerr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
- fprintf(pysamerr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
- fprintf(pysamerr, " -D, --remove-duplicates Alias for -d none\n");
- fprintf(pysamerr, " -f, --file-list <file> Read the list of files from a file.\n");
- fprintf(pysamerr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
- fprintf(pysamerr, " -o, --output <file> Write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(pysamerr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
- fprintf(pysamerr, " -r, --regions <region> Restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> Restrict to regions listed in a file\n");
- fprintf(pysamerr, " --threads <int> Number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n");
+ fprintf(pysam_stderr, " columns appearing in the same order. The program can be used, for example, to\n");
+ fprintf(pysam_stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n");
+ fprintf(pysam_stderr, " VCF into one. The input files must be sorted by chr and position. The files\n");
+ fprintf(pysam_stderr, " must be given in the correct order to produce sorted VCF on output unless\n");
+ fprintf(pysam_stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n");
+ fprintf(pysam_stderr, " are concatenated without being recompressed, which is very fast but dangerous\n");
+ fprintf(pysam_stderr, " if the BCF headers differ.\n");
+ fprintf(pysam_stderr, "Usage: bcftools concat [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
+ fprintf(pysam_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
+ fprintf(pysam_stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
+ fprintf(pysam_stderr, " -D, --remove-duplicates Alias for -d none\n");
+ fprintf(pysam_stderr, " -f, --file-list <file> Read the list of files from a file.\n");
+ fprintf(pysam_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
+ fprintf(pysam_stderr, " -o, --output <file> Write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
+ fprintf(pysam_stderr, " -r, --regions <region> Restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> Restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " --threads <int> Number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->min_PQ = 30;
static struct option loptions[] =
{
+ {"naive",no_argument,NULL,'n'},
{"compact-PS",no_argument,NULL,'c'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"threads",required_argument,NULL,9},
{"file-list",required_argument,NULL,'f'},
{"min-PQ",required_argument,NULL,'q'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0)
{
switch (c) {
case 'c': args->compact_PS = 1; break;
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
break;
+ case 'n': args->naive_concat = 1; break;
case 'a': args->allow_overlaps = 1; break;
case 'l': args->phased_concat = 1; break;
case 'f': args->file_list = optarg; break;
};
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
if ( !args->nfnames ) usage(args);
if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n");
if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
+ if ( args->naive_concat )
+ {
+ if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
+ if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+ naive_concat(args);
+ destroy_data(args);
+ free(args);
+ return 0;
+ }
init_data(args);
concat(args);
destroy_data(args);
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname;
- int argc, n_threads;
+ int argc, n_threads, record_cmd_line;
};
static void destroy_data(args_t *args)
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype Probabilities\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nrows, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nrows);
bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
args->header = bcf_hdr_init("w");
bcf_hdr_set_chrs(args->header, args->ref);
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, n;
char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
- bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
bcf_hdr_write(out_fh,hdr);
int32_t *itmp = NULL, nitmp = 0;
fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "VCF output options:\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
fprintf(stderr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
args->outfname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
static struct option loptions[] =
{
{"haplegendsample2vcf",required_argument,NULL,'H'},
{"columns",required_argument,NULL,'c'},
{"fasta-ref",required_argument,NULL,'f'},
+ {"no-version",no_argument,NULL,10},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
break;
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 10 : args->record_cmd_line = 0; break;
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
}
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname;
- int argc, n_threads;
+ int argc, n_threads, record_cmd_line;
};
static void destroy_data(args_t *args)
{
float aa,ab,bb;
aa = strtod(tsv->ss, &tsv->se);
- if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse first value of %d-th sample\n", i+1); return -1; }
+ if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse first value of %d-th sample\n", i+1); return -1; }
tsv->ss = tsv->se+1;
ab = strtod(tsv->ss, &tsv->se);
- if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse second value of %d-th sample\n", i+1); return -1; }
+ if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse second value of %d-th sample\n", i+1); return -1; }
tsv->ss = tsv->se+1;
bb = strtod(tsv->ss, &tsv->se);
- if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse third value of %d-th sample\n", i+1); return -1; }
+ if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse third value of %d-th sample\n", i+1); return -1; }
tsv->ss = tsv->se+1;
if ( args->rev_als ) { float tmp = bb; bb = aa; aa = tmp; }
if ( !ss[0] || !ss[1] || !ss[2] ||
(up && (!ss[3] || !ss[4]) ) )
{
- fprintf(pysamerr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]);
+ fprintf(pysam_stderr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]);
return -1;
}
args->gts[2*i+all] = bcf_int32_vector_end;
break;
default :
- fprintf(pysamerr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
+ fprintf(pysam_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
return -1;
}
if( ss[all*2+up+1]=='*' ) up = up + 1;
if(up && up != 2)
{
- fprintf(pysamerr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
+ fprintf(pysam_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
return -1;
}
}
if ( tsv->ss[(nsamples-1)*4+3+nup] )
{
- fprintf(pysamerr,"nup: %d", nup);
- fprintf(pysamerr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]);
+ fprintf(pysam_stderr,"nup: %d", nup);
+ fprintf(pysam_stderr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]);
return -1;
}
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype Probabilities\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
free(args->flt);
tsv_destroy(tsv);
- fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total);
+ fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total);
}
static void haplegendsample_to_vcf(args_t *args)
bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nrows, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nrows);
tsv_destroy(hap_tsv);
tsv_destroy(leg_tsv);
- fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total);
+ fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total);
}
static void hapsample_to_vcf(args_t *args)
bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
free(args->gts);
tsv_destroy(tsv);
- fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total);
+ fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total);
}
static void vcf_to_gensample(args_t *args)
if ( gen_fname && (strlen(gen_fname)<3 || strcasecmp(".gz",gen_fname+strlen(gen_fname)-3)) ) gen_compressed = 0;
if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
- if (gen_fname) fprintf(pysamerr, "Gen file: %s\n", gen_fname);
- if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname);
+ if (gen_fname) fprintf(pysam_stderr, "Gen file: %s\n", gen_fname);
+ if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
if (sample_fname) {
// biallelic required
if ( line->n_allele>2 ) {
if (!non_biallelic)
- fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
non_biallelic++;
continue;
}
nok++;
}
}
- fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
+ fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
if ( str.m ) free(str.s);
if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0;
if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
- if (hap_fname) fprintf(pysamerr, "Haps file: %s\n", hap_fname);
- if (legend_fname) fprintf(pysamerr, "Legend file: %s\n", legend_fname);
- if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname);
+ if (hap_fname) fprintf(pysam_stderr, "Haps file: %s\n", hap_fname);
+ if (legend_fname) fprintf(pysam_stderr, "Legend file: %s\n", legend_fname);
+ if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
if (sample_fname) {
// biallelic required
if ( line->n_allele>2 ) {
if (!non_biallelic)
- fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
non_biallelic++;
continue;
}
}
nok++;
}
- fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
+ fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
if ( str.m ) free(str.s);
if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
if ( lout && bgzf_close(lout)!=0 ) error("Error closing %s: %s\n", legend_fname, strerror(errno));
if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
- if (hap_fname) fprintf(pysamerr, "Haps file: %s\n", hap_fname);
- if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname);
+ if (hap_fname) fprintf(pysam_stderr, "Haps file: %s\n", hap_fname);
+ if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
if (sample_fname) {
// biallelic required
if ( line->n_allele>2 ) {
if (!non_biallelic)
- fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
+ fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
non_biallelic++;
continue;
}
}
nok++;
}
- fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
+ fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
if ( str.m ) free(str.s);
if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
if (hap_fname) free(hap_fname);
args->header = bcf_hdr_init("w");
bcf_hdr_set_chrs(args->header, args->ref);
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, n;
char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
free(args->str.s);
free(args->gts);
- fprintf(pysamerr,"Rows total: \t%d\n", args->n.total);
- fprintf(pysamerr,"Rows skipped: \t%d\n", args->n.skipped);
- fprintf(pysamerr,"Missing GTs: \t%d\n", args->n.missing);
- fprintf(pysamerr,"Hom RR: \t%d\n", args->n.hom_rr);
- fprintf(pysamerr,"Het RA: \t%d\n", args->n.het_ra);
- fprintf(pysamerr,"Hom AA: \t%d\n", args->n.hom_aa);
- fprintf(pysamerr,"Het AA: \t%d\n", args->n.het_aa);
+ fprintf(pysam_stderr,"Rows total: \t%d\n", args->n.total);
+ fprintf(pysam_stderr,"Rows skipped: \t%d\n", args->n.skipped);
+ fprintf(pysam_stderr,"Missing GTs: \t%d\n", args->n.missing);
+ fprintf(pysam_stderr,"Hom RR: \t%d\n", args->n.hom_rr);
+ fprintf(pysam_stderr,"Het RA: \t%d\n", args->n.het_ra);
+ fprintf(pysam_stderr,"Hom AA: \t%d\n", args->n.hom_aa);
+ fprintf(pysam_stderr,"Het AA: \t%d\n", args->n.het_aa);
}
static void vcf_to_vcf(args_t *args)
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
- bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
bcf_hdr_write(out_fh,hdr);
int32_t *itmp = NULL, nitmp = 0;
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Converts VCF/BCF to other formats and back. See man page for file\n");
- fprintf(pysamerr, " formats details. When specifying output files explicitly instead\n");
- fprintf(pysamerr, " of with <prefix>, one can use '-' for stdout and '.' to suppress.\n");
- fprintf(pysamerr, "Usage: bcftools convert [OPTIONS] <input_file>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "VCF input options:\n");
- fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --samples <list> list of samples to include\n");
- fprintf(pysamerr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "VCF output options:\n");
- fprintf(pysamerr, " -o, --output <file> output file name [stdout]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
- fprintf(pysamerr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
- fprintf(pysamerr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
- fprintf(pysamerr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
- fprintf(pysamerr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
- fprintf(pysamerr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "gVCF conversion:\n");
- fprintf(pysamerr, " --gvcf2vcf expand gVCF reference blocks\n");
- fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
- fprintf(pysamerr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
- fprintf(pysamerr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
- fprintf(pysamerr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
- fprintf(pysamerr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "HAP/LEGEND/SAMPLE conversion:\n");
- fprintf(pysamerr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
- fprintf(pysamerr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
- fprintf(pysamerr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
- fprintf(pysamerr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "TSV conversion:\n");
- fprintf(pysamerr, " --tsv2vcf <file> \n");
- fprintf(pysamerr, " -c, --columns <string> columns of the input tsv file [ID,CHROM,POS,AA]\n");
- fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(pysamerr, " -s, --samples <list> list of sample names\n");
- fprintf(pysamerr, " -S, --samples-file <file> file of sample names\n");
- fprintf(pysamerr, "\n");
- // fprintf(pysamerr, "PLINK options:\n");
- // fprintf(pysamerr, " -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
- // fprintf(pysamerr, " --tped make tped file instead\n");
- // fprintf(pysamerr, " --bin make binary bed/fam/bim files\n");
- // fprintf(pysamerr, "\n");
- // fprintf(pysamerr, "PBWT options:\n");
- // fprintf(pysamerr, " -b, --pbwt <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
- // fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n");
+ fprintf(pysam_stderr, " formats details. When specifying output files explicitly instead\n");
+ fprintf(pysam_stderr, " of with <prefix>, one can use '-' for stdout and '.' to suppress.\n");
+ fprintf(pysam_stderr, "Usage: bcftools convert [OPTIONS] <input_file>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "VCF input options:\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(pysam_stderr, " -i, --include <expr> select sites for which the expression is true\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --samples <list> list of samples to include\n");
+ fprintf(pysam_stderr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "VCF output options:\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -o, --output <file> output file name [stdout]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
+ fprintf(pysam_stderr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
+ fprintf(pysam_stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
+ fprintf(pysam_stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
+ fprintf(pysam_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysam_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "gVCF conversion:\n");
+ fprintf(pysam_stderr, " --gvcf2vcf expand gVCF reference blocks\n");
+ fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
+ fprintf(pysam_stderr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
+ fprintf(pysam_stderr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
+ fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "HAP/LEGEND/SAMPLE conversion:\n");
+ fprintf(pysam_stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
+ fprintf(pysam_stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
+ fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "TSV conversion:\n");
+ fprintf(pysam_stderr, " --tsv2vcf <file> \n");
+ fprintf(pysam_stderr, " -c, --columns <string> columns of the input tsv file [ID,CHROM,POS,AA]\n");
+ fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(pysam_stderr, " -s, --samples <list> list of sample names\n");
+ fprintf(pysam_stderr, " -S, --samples-file <file> file of sample names\n");
+ fprintf(pysam_stderr, "\n");
+ // fprintf(pysam_stderr, "PLINK options:\n");
+ // fprintf(pysam_stderr, " -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
+ // fprintf(pysam_stderr, " --tped make tped file instead\n");
+ // fprintf(pysam_stderr, " --bin make binary bed/fam/bim files\n");
+ // fprintf(pysam_stderr, "\n");
+ // fprintf(pysam_stderr, "PBWT options:\n");
+ // fprintf(pysam_stderr, " -b, --pbwt <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
+ // fprintf(pysam_stderr, "\n");
exit(1);
}
args->outfname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
static struct option loptions[] =
{
{"haplegendsample2vcf",required_argument,NULL,'H'},
{"columns",required_argument,NULL,'c'},
{"fasta-ref",required_argument,NULL,'f'},
+ {"no-version",no_argument,NULL,10},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
break;
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 10 : args->record_cmd_line = 0; break;
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
}
int output_type, n_threads;
char **argv, *output_fname, *targets_list, *regions_list;
- int argc;
+ int argc, record_cmd_line;
}
args_t;
}
}
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
fprintf(stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int regions_is_file = 0, targets_is_file = 0;
static struct option loptions[] =
{"threads",required_argument,NULL,9},
{"SnpGap",required_argument,NULL,'g'},
{"IndelGap",required_argument,NULL,'G'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
else error("The argument to -S not recognised: %s\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage(args);
default: error("Unknown argument: %s\n", optarg);
int output_type, n_threads;
char **argv, *output_fname, *targets_list, *regions_list;
- int argc;
+ int argc, record_cmd_line;
}
args_t;
if ( tmp.s ) kputs(" and ", &tmp);
kputs("\"IndelGap\"", &tmp);
}
- fprintf(pysamerr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
+ fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
free(tmp.s);
}
}
}
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Apply fixed-threshold filters.\n");
- fprintf(pysamerr, "Usage: bcftools filter [options] <in.vcf.gz>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(pysamerr, " -g, --SnpGap <int> filter SNPs within <int> base pairs of an indel\n");
- fprintf(pysamerr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
- fprintf(pysamerr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
- fprintf(pysamerr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --soft-filter <string> annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n");
- fprintf(pysamerr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n"); /* usage(): print the "bcftools filter" help to pysam_stderr; the exit(1) after the last line means this never returns. The args parameter is not referenced in the body. */
+ fprintf(pysam_stderr, "About: Apply fixed-threshold filters.\n");
+ fprintf(pysam_stderr, "Usage: bcftools filter [options] <in.vcf.gz>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -g, --SnpGap <int> filter SNPs within <int> base pairs of an indel\n");
+ fprintf(pysam_stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
+ fprintf(pysam_stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); /* long-only option; no short letter is advertised */
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --soft-filter <string> annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n"); /* %% is needed so fprintf emits a literal percent sign */
+ fprintf(pysam_stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int regions_is_file = 0, targets_is_file = 0;
static struct option loptions[] =
{"threads",required_argument,NULL,9},
{"SnpGap",required_argument,NULL,'g'},
{"IndelGap",required_argument,NULL,'G'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
else error("The argument to -S not recognised: %s\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage(args);
default: error("Unknown argument: %s\n", optarg);
int len = strlen(script);
char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script);
int ret = system(cmd);
- if ( ret ) fprintf(pysamerr, "The command returned non-zero status %d: %s\n", ret, cmd);
+ if ( ret ) fprintf(pysam_stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
free(cmd);
}
gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k);
}
}
- //for (i=0; i<n_gt2ipl; i++) printf("%d .. %d\n", i,gt2ipl[i]);
+ //for (i=0; i<n_gt2ipl; i++) fprintf(pysam_stdout, "%d .. %d\n", i,gt2ipl[i]);
return 1;
}
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
if ( !args->no_PLs )
- fprintf(pysamerr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
+ fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
fake_pls = 1;
}
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
+ FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout;
print_header(args, fp);
int tgt_isample = -1, query_isample = 0;
{
if ( tgt_isample==-1 )
{
- fprintf(pysamerr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
+ fprintf(pysam_stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
tgt_isample = 0;
}
}
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
if ( !args->no_PLs )
- fprintf(pysamerr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
+ fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
fake_pls = 1;
}
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
+ FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout;
print_header(args, fp);
if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");
if ( args->tmp_arr ) free(args->tmp_arr);
if ( is_hom ) free(is_hom);
- if ( pl_warned ) fprintf(pysamerr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
- if ( dp_warned ) fprintf(pysamerr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
+ if ( pl_warned ) fprintf(pysam_stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
+ if ( dp_warned ) fprintf(pysam_stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
// Output samples sorted by average discordance
double *score = (double*) calloc(nsamples,sizeof(double));
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Check sample identity. With no -g BCF given, multi-sample cross-check is performed.\n");
- fprintf(pysamerr, "Usage: bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -a, --all-sites output comparison for all sites\n");
- fprintf(pysamerr, " -g, --genotypes <file> genotypes to compare against\n");
- fprintf(pysamerr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
- fprintf(pysamerr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
- fprintf(pysamerr, " -p, --plot <prefix> plot\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --query-sample <string> query sample (by default the first sample is checked)\n");
- fprintf(pysamerr, " -S, --target-sample <string> target sample in the -g file (used only for plotting)\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n"); /* usage(): print the "bcftools gtcheck" help to pysam_stderr; the exit(1) after the last line means this never returns */
+ fprintf(pysam_stderr, "About: Check sample identity. With no -g BCF given, multi-sample cross-check is performed.\n");
+ fprintf(pysam_stderr, "Usage: bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n"); /* the option list below matches the removed lines text-for-text; only the stream name differs */
+ fprintf(pysam_stderr, " -a, --all-sites output comparison for all sites\n");
+ fprintf(pysam_stderr, " -g, --genotypes <file> genotypes to compare against\n");
+ fprintf(pysam_stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
+ fprintf(pysam_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
+ fprintf(pysam_stderr, " -p, --plot <prefix> plot\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --query-sample <string> query sample (by default the first sample is checked)\n");
+ fprintf(pysam_stderr, " -S, --target-sample <string> target sample in the -g file (used only for plotting)\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2016 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
if (stats) return vcf_index_stats(fname, stats);
htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) error("Failed to read %s\n", fname);
htsFormat type = *hts_get_format(fp);
hts_close(fp);
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2016 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Index bgzip compressed VCF/BCF files for random access.\n");
- fprintf(pysamerr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Indexing options:\n");
- fprintf(pysamerr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
- fprintf(pysamerr, " -f, --force overwrite index if it already exists\n");
- fprintf(pysamerr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
- fprintf(pysamerr, " -t, --tbi generate TBI-format index for VCF files\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Stats options:\n");
- fprintf(pysamerr, " -n, --nrecords print number of records based on existing index file\n");
- fprintf(pysamerr, " -s, --stats print per contig stats based on existing index file\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n"); /* usage(): print the "bcftools index" help to pysam_stderr; the exit(1) after the last line means this never returns */
+ fprintf(pysam_stderr, "About: Index bgzip compressed VCF/BCF files for random access.\n");
+ fprintf(pysam_stderr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Indexing options:\n"); /* first of two option groups: flags that control how an index is built */
+ fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
+ fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n");
+ fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Stats options:\n"); /* second group: flags that report on an index that already exists */
+ fprintf(pysam_stderr, " -n, --nrecords print number of records based on existing index file\n");
+ fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
{
char *fn_out = NULL;
FILE *out;
- out = fn_out ? fopen(fn_out, "w") : stdout;
+ out = fn_out ? fopen(fn_out, "w") : pysam_stdout;
const char **seq;
int i, nseq;
hts_idx_t *idx = NULL;
htsFile *fp = hts_open(fname,"r");
- if ( !fp ) { fprintf(pysamerr,"Could not read %s\n", fname); return 1; }
+ if ( !fp ) { fprintf(pysam_stderr,"Could not read %s\n", fname); return 1; }
bcf_hdr_t *hdr = bcf_hdr_read(fp);
- if ( !hdr ) { fprintf(pysamerr,"Could not read the header: %s\n", fname); return 1; }
+ if ( !hdr ) { fprintf(pysam_stderr,"Could not read the header: %s\n", fname); return 1; }
if ( hts_get_format(fp)->format==vcf )
{
tbx = tbx_index_load(fname);
- if ( !tbx ) { fprintf(pysamerr,"Could not load TBI index: %s\n", fname); return 1; }
+ if ( !tbx ) { fprintf(pysam_stderr,"Could not load TBI index: %s\n", fname); return 1; }
}
else if ( hts_get_format(fp)->format==bcf )
{
idx = bcf_index_load(fname);
- if ( !idx ) { fprintf(pysamerr,"Could not load CSI index: %s\n", fname); return 1; }
+ if ( !idx ) { fprintf(pysam_stderr,"Could not load CSI index: %s\n", fname); return 1; }
}
else
{
- fprintf(pysamerr,"Could not detect the file type as VCF or BCF: %s\n", fname);
+ fprintf(pysam_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
return 1;
}
bcf1_t *rec = bcf_init1();
if (bcf_read1(fp, hdr, rec) >= 0)
{
- fprintf(pysamerr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
+ fprintf(pysam_stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
return 1;
}
bcf_destroy1(rec);
if ( optind==argc ) usage();
if (stats>2)
{
- fprintf(pysamerr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
+ fprintf(pysam_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
return 1;
}
if (tbi && min_shift>0)
{
- fprintf(pysamerr, "[E::%s] min-shift option only expected for CSI indices \n", __func__);
+ fprintf(pysam_stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__);
return 1;
}
if (min_shift < 0 || min_shift > 30)
{
- fprintf(pysamerr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
+ fprintf(pysam_stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
return 1;
}
if (stats) return vcf_index_stats(fname, stats);
htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) error("Failed to read %s\n", fname);
htsFormat type = *hts_get_format(fp);
hts_close(fp);
if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf )
{
- fprintf(pysamerr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
+ fprintf(pysam_stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
if ( type.compression!=bgzf )
- fprintf(pysamerr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
+ fprintf(pysam_stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
return 1;
}
if (tbi && type.format==bcf)
{
- fprintf(pysamerr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
+ fprintf(pysam_stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
tbi = 0; min_shift = BCF_LIDX_SHIFT;
}
if (min_shift == 0 && type.format==bcf)
{
- fprintf(pysamerr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
+ fprintf(pysam_stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
return 1;
}
if (!tbi && type.format==vcf && min_shift == 0)
{
- fprintf(pysamerr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
+ fprintf(pysam_stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
tbi = 1;
}
stat(fname, &stat_file);
if ( stat_file.st_mtime <= stat_tbi.st_mtime )
{
- fprintf(pysamerr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
+ fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
return 1;
}
}
{
if ( bcf_index_build(fname, min_shift) != 0 )
{
- fprintf(pysamerr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
+ fprintf(pysam_stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
return 1;
}
}
{
if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
{
- fprintf(pysamerr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
+ fprintf(pysam_stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
return 1;
}
}
htsFile **fh_out;
char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
char *isec_exact;
- int argc;
+ int argc, record_cmd_line;
}
args_t;
out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
- bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
+ if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
bcf_hdr_write(out_fh, files->readers[args->iwrite].header);
}
if ( !args->nwrite && !out_std && !args->prefix )
args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \
if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
- bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
+ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \
}
if ( !args->nwrite || args->write[0] )
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
args->output_fname = NULL;
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
}
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
htsFile **fh_out;
char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
char *isec_exact;
- int argc;
+ int argc, record_cmd_line;
}
args_t;
kstring_t str = {0,0,0};
htsFile *out_fh = NULL;
- // When only one VCF is output, print VCF to stdout or -o file
+ // When only one VCF is output, print VCF to pysam_stdout or -o file
int out_std = 0;
if ( args->nwrite==1 && !args->prefix ) out_std = 1;
if ( args->targets_list && files->nreaders==1 ) out_std = 1;
out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
- bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
+ if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
bcf_hdr_write(out_fh, files->readers[args->iwrite].header);
}
if ( !args->nwrite && !out_std && !args->prefix )
- fprintf(pysamerr,"Note: -w option not given, printing list of sites...\n");
+ fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n");
int n;
while ( (n=bcf_sr_next_line(files)) )
args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \
if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
- bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
+ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \
}
if ( !args->nwrite || args->write[0] )
if ( args->fh_sites == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
}
else
- args->fh_sites = stdout;
+ args->fh_sites = pysam_stdout;
}
}
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Create intersections, unions and complements of VCF files.\n");
- fprintf(pysamerr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
- fprintf(pysamerr, " -C, --complement output positions present only in the first file but missing in the others\n");
- fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(pysamerr, " -i, --include <expr> include only sites for which the expression is true\n");
- fprintf(pysamerr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(pysamerr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Examples:\n");
- fprintf(pysamerr, " # Create intersection and complements of two sets saving the output in dir/*\n");
- fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, " # Filter sites in A and B (but not in C) and create intersection\n");
- fprintf(pysamerr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, " # Extract and write records from A shared by both A and B using exact allele match\n");
- fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, " # Extract records private to A or B comparing by position only\n");
- fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n"); /* usage(): print the "bcftools isec" help (options, then worked examples) to pysam_stderr; the exit(1) after the last line means this never returns */
+ fprintf(pysam_stderr, "About: Create intersections, unions and complements of VCF files.\n");
+ fprintf(pysam_stderr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(pysam_stderr, " -C, --complement output positions present only in the first file but missing in the others\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(pysam_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysam_stderr, " -i, --include <expr> include only sites for which the expression is true\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); /* entry with no counterpart among the removed lines; long-only, no short letter advertised */
+ fprintf(pysam_stderr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n"); /* now printed before -w; the removed lines listed -w first */
+ fprintf(pysam_stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Examples:\n"); /* each example is a "#" description line followed by the command line it describes */
+ fprintf(pysam_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
+ fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, " # Filter sites in A and B (but not in C) and create intersection\n");
+ fprintf(pysam_stderr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, " # Extract and write records from A shared by both A and B using exact allele match\n");
+ fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, " # Extract records private to A or B comparing by position only\n");
+ fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
args->output_fname = NULL;
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
}
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
htsFile *out_fh;
bcf_hdr_t *out_hdr;
char **argv;
- int argc, n_threads;
+ int argc, n_threads, record_cmd_line;
}
args_t;
}
if ( ith_src!=isrc ) return -1; // requested field not found
int end_src = start_src;
- while ( end_src<src_len && src[end_src]!=',' ) end_src++;
+ while ( end_src<src_len && src[end_src] && src[end_src]!=',' ) end_src++;
int nsrc_cpy = end_src - start_src;
if ( nsrc_cpy==1 && src[start_src]=='.' ) return 0; // don't write missing values, dst is already initialized
char buf[10]; snprintf(buf,10,"%d",i+1);
merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
}
- bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
bcf_hdr_sync(args->out_hdr);
}
info_rules_init(args);
fprintf(stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(stderr, " -l, --file-list <file> read file names from the file\n");
fprintf(stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->collapse = COLLAPSE_BOTH;
int regions_is_file = 0;
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
case 2 : args->header_only = 1; break;
case 3 : args->force_samples = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
htsFile *out_fh;
bcf_hdr_t *out_hdr;
char **argv;
- int argc, n_threads;
+ int argc, n_threads, record_cmd_line;
}
args_t;
/*
 * debug_als - debugging helper that prints a list of allele strings.
 *
 * als:  array of C strings to print
 * nals: number of entries of `als` that are printed
 *
 * Each string is written followed by a single space, and a newline is written
 * after the last one. Nothing is returned.
 *
 * Patch note: the removed (-) lines write to `pysamerr`, the added (+) lines
 * write to `pysam_stderr`; the format strings are unchanged.
 * NOTE(review): `pysam_stderr` is declared outside this excerpt - presumably
 * pysam's replacement for stderr; confirm against its declaration.
 */
void debug_als(char **als, int nals)
{
- int k; for (k=0; k<nals; k++) fprintf(pysamerr,"%s ", als[k]);
- fprintf(pysamerr,"\n");
+ int k; for (k=0; k<nals; k++) fprintf(pysam_stderr,"%s ", als[k]);
+ fprintf(pysam_stderr,"\n");
}
/**
{
if ( strncasecmp(a[0],b[0],rla<rlb?rla:rlb) )
{
- fprintf(pysamerr, "The REF prefixes differ: %s vs %s (%d,%d)\n", a[0],b[0],rla,rlb);
+ fprintf(pysam_stderr, "The REF prefixes differ: %s vs %s (%d,%d)\n", a[0],b[0],rla,rlb);
return NULL;
}
// Different case, change to uppercase
}
/*
 * maux_debug - debugging helper that prints the contents of a maux_t.
 *
 * ma: structure whose `nals`, `als[]` and `cnt[]` members are read
 * ir: first integer printed in the "[ir,ib]" prefix
 * ib: second integer printed in the "[ir,ib]" prefix
 *
 * Output is one line: "[ir,ib]" and a tab, then " <als[i]> [<cnt[i]>]" for
 * every i below ma->nals, then a newline. Nothing is returned.
 *
 * Patch note: the removed (-) lines use printf(), the added (+) lines use
 * fprintf(pysam_stdout, ...) with the same format strings.
 * NOTE(review): the meaning of ir/ib is not visible in this excerpt (they
 * look like reader/buffer indices) - confirm against the callers.
 */
void maux_debug(maux_t *ma, int ir, int ib)
{
- printf("[%d,%d]\t", ir,ib);
+ fprintf(pysam_stdout, "[%d,%d]\t", ir,ib);
int i;
for (i=0; i<ma->nals; i++)
{
- printf(" %s [%d]", ma->als[i], ma->cnt[i]);
+ fprintf(pysam_stdout, " %s [%d]", ma->als[i], ma->cnt[i]);
}
- printf("\n");
+ fprintf(pysam_stdout, "\n");
}
void merge_chrom2qual(args_t *args, bcf1_t *out)
}
if ( ith_src!=isrc ) return -1; // requested field not found
int end_src = start_src;
- while ( end_src<src_len && src[end_src]!=',' ) end_src++;
+ while ( end_src<src_len && src[end_src] && src[end_src]!=',' ) end_src++;
int nsrc_cpy = end_src - start_src;
if ( nsrc_cpy==1 && src[start_src]=='.' ) return 0; // don't write missing values, dst is already initialized
case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break;
case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break;
- default: fprintf(pysamerr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ default: fprintf(pysam_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
}
#undef BRANCH
}
case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break;
case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break;
- default: fprintf(pysamerr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ default: fprintf(pysam_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
}
#undef BRANCH
}
if ( !reader->buffer ) return;
int i;
- // FILE *fp = stdout;
+ // FILE *fp = pysam_stdout;
// fprintf(fp,"<going to shake> nbuf=%d\t", reader->nbuffer); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n");
// debug_buffer(fp,reader);
// fprintf(fp,"--\n");
maux_t *maux = args->maux;
int j,k,l;
- fprintf(pysamerr,"Alleles to merge at %d\n", pos+1);
+ fprintf(pysam_stderr,"Alleles to merge at %d\n", pos+1);
for (j=0; j<files->nreaders; j++)
{
bcf_sr_t *reader = &files->readers[j];
- fprintf(pysamerr," reader %d: ", j);
+ fprintf(pysam_stderr," reader %d: ", j);
for (k=0; k<=reader->nbuffer; k++)
{
if ( maux->d[j][k].skip==SKIP_DONE ) continue;
bcf1_t *line = reader->buffer[k];
if ( line->pos!=pos ) continue;
- fprintf(pysamerr,"\t");
- if ( maux->d[j][k].skip ) fprintf(pysamerr,"["); // this record will not be merged in this round
+ fprintf(pysam_stderr,"\t");
+ if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round
for (l=0; l<line->n_allele; l++)
- fprintf(pysamerr,"%s%s", l==0?"":",", line->d.allele[l]);
- if ( maux->d[j][k].skip ) fprintf(pysamerr,"]");
+ fprintf(pysam_stderr,"%s%s", l==0?"":",", line->d.allele[l]);
+ if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"]");
}
- fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr,"\n");
}
- fprintf(pysamerr," counts: ");
- for (j=0; j<maux->nals; j++) fprintf(pysamerr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr," counts: ");
+ for (j=0; j<maux->nals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysam_stderr,"\n");
for (j=0; j<files->nreaders; j++)
{
bcf_sr_t *reader = &files->readers[j];
- fprintf(pysamerr," out %d: ", j);
+ fprintf(pysam_stderr," out %d: ", j);
for (k=0; k<=reader->nbuffer; k++)
{
if ( maux->d[j][k].skip==SKIP_DONE ) continue;
bcf1_t *line = reader->buffer[k];
if ( line->pos!=pos ) continue;
if ( maux->d[j][k].skip ) continue;
- fprintf(pysamerr,"\t");
+ fprintf(pysam_stderr,"\t");
for (l=0; l<line->n_allele; l++)
- fprintf(pysamerr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
+ fprintf(pysam_stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
}
- fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr,"\n");
}
- fprintf(pysamerr,"\n");
+ fprintf(pysam_stderr,"\n");
}
// Determine which line should be merged from which reader: go through all
char buf[10]; snprintf(buf,10,"%d",i+1);
merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
}
- bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
bcf_hdr_sync(args->out_hdr);
}
info_rules_init(args);
/*
 * usage - print the help text of "bcftools merge" and terminate.
 *
 * Takes no arguments and does not return: after the help text has been
 * written, exit(1) is called.
 *
 * Patch note: the removed (-) lines write to `pysamerr`, the added (+) lines
 * write to `pysam_stderr`. The added version also contains one new help line,
 * documenting the --no-version option; all other lines are the same text.
 */
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n");
- fprintf(pysamerr, " Note that only records from different files can be merged, never from the same file. For\n");
- fprintf(pysamerr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n");
- fprintf(pysamerr, "Usage: bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " --force-samples resolve duplicate sample names\n");
- fprintf(pysamerr, " --print-header print only the merged header and exit\n");
- fprintf(pysamerr, " --use-header <file> use the provided header\n");
- fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(pysamerr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
- fprintf(pysamerr, " -l, --file-list <file> read file names from the file\n");
- fprintf(pysamerr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n");
+ fprintf(pysam_stderr, " Note that only records from different files can be merged, never from the same file. For\n");
+ fprintf(pysam_stderr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n");
+ fprintf(pysam_stderr, "Usage: bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " --force-samples resolve duplicate sample names\n");
+ fprintf(pysam_stderr, " --print-header print only the merged header and exit\n");
+ fprintf(pysam_stderr, " --use-header <file> use the provided header\n");
+ fprintf(pysam_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysam_stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
+ fprintf(pysam_stderr, " -l, --file-list <file> read file names from the file\n");
+ fprintf(pysam_stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "\n");
/* Printing the help text always ends the process with exit status 1. */
exit(1);
}
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->collapse = COLLAPSE_BOTH;
int regions_is_file = 0;
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
case 2 : args->header_only = 1; break;
case 3 : args->force_samples = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
+ int record_cmd_line;
}
args_t;
if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
}
-
// trim from right
int ori_pos = line->pos;
while (1)
{
// is the rightmost base identical in all alleles?
+ int min_len = als[0].l;
for (i=1; i<line->n_allele; i++)
{
if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break;
+ if ( als[i].l < min_len ) min_len = als[i].l;
}
if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+ if ( min_len<=1 && line->pos==0 ) break;
int pad_from_left = 0;
for (i=0; i<line->n_allele; i++) // trim all alleles
if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
}
- if ( i!=line->n_allele || min_len==1 ) break; // there are differences, cannot be trimmed
+ if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
ntrim_left++;
}
if ( ntrim_left )
{
kstring_t *tmp = &args->tmp_str[i];
kputsn(tmp->s,tmp->l,&str);
- for (j=tmp->l; j<max_len; j++) kputc(0,tmp);
+ for (j=tmp->l; j<max_len; j++) kputc('\0',&str);
}
args->ntmp_arr2 = str.m;
args->tmp_arr2 = (uint8_t*)str.s;
htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out, args->n_threads);
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
bcf_hdr_write(out, args->hdr);
int prev_rid = -1, prev_pos = -1, prev_type = 0;
if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
j++;
}
- if ( args->rbuf.n==args->rbuf.m ) j = 1;
if ( j>0 ) flush_buffer(args, out, j);
}
flush_buffer(args, out, args->rbuf.n);
fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence\n");
fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, "\n");
exit(1);
}
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->aln_win = 100;
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
{"threads",required_argument,NULL,9},
{"check-ref",required_argument,NULL,'c'},
{"strict-filter",no_argument,NULL,'s'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
if ( *tmp ) error("Could not parse argument: --site-win %s\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
+ int record_cmd_line;
}
args_t;
if ( args->check_ref==CHECK_REF_EXIT )
error("Reference allele mismatch at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]);
if ( args->check_ref & CHECK_REF_WARN )
- fprintf(pysamerr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+ fprintf(pysam_stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
free(ref);
return ERR_REF_MISMATCH;
}
if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
}
-
// trim from right
int ori_pos = line->pos;
while (1)
{
// is the rightmost base identical in all alleles?
+ int min_len = als[0].l;
for (i=1; i<line->n_allele; i++)
{
if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break;
+ if ( als[i].l < min_len ) min_len = als[i].l;
}
if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+ if ( min_len<=1 && line->pos==0 ) break;
int pad_from_left = 0;
for (i=0; i<line->n_allele; i++) // trim all alleles
if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
}
- if ( i!=line->n_allele || min_len==1 ) break; // there are differences, cannot be trimmed
+ if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
ntrim_left++;
}
if ( ntrim_left )
{ \
/* expecting diploid gt in INFO */ \
if (nvals_ori!=lines[0]->n_allele*(lines[0]->n_allele+1)/2) { \
- fprintf(pysamerr, "todo: merge Number=G INFO fields for haploid sites\n"); \
+ fprintf(pysam_stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \
error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \
} \
int nvals = dst->n_allele*(dst->n_allele+1)/2; \
{
kstring_t *tmp = &args->tmp_str[i];
kputsn(tmp->s,tmp->l,&str);
- for (j=tmp->l; j<max_len; j++) kputc(0,tmp);
+ for (j=tmp->l; j<max_len; j++) kputc('\0',&str);
}
args->ntmp_arr2 = str.m;
args->tmp_arr2 = (uint8_t*)str.s;
else if ( args->check_ref==CHECK_REF_EXIT )
error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1);
else if ( args->check_ref & CHECK_REF_WARN )
- fprintf(pysamerr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+ fprintf(pysam_stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1);
}
}
}
htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out, args->n_threads);
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
bcf_hdr_write(out, args->hdr);
int prev_rid = -1, prev_pos = -1, prev_type = 0;
if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
j++;
}
- if ( args->rbuf.n==args->rbuf.m ) j = 1;
if ( j>0 ) flush_buffer(args, out, j);
}
flush_buffer(args, out, args->rbuf.n);
hts_close(out);
- fprintf(pysamerr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
+ fprintf(pysam_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
if ( args->check_ref & CHECK_REF_FIX )
- fprintf(pysamerr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
+ fprintf(pysam_stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
}
/*
 * usage - print the help text of "bcftools norm" and terminate.
 *
 * Takes no arguments and does not return: after the help text has been
 * written, exit(1) is called.
 *
 * Patch note: the removed (-) lines write to `pysamerr`, the added (+) lines
 * write to `pysam_stderr`. The added version also introduces a help line for
 * the --no-version option and prints the --threads line before the
 * -w, --site-win line instead of after it; the remaining text is the same.
 */
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n");
- fprintf(pysamerr, " split multiallelic sites into multiple rows; recover multiallelics from\n");
- fprintf(pysamerr, " multiple rows.\n");
- fprintf(pysamerr, "Usage: bcftools norm [options] <in.vcf.gz>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
- fprintf(pysamerr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
- fprintf(pysamerr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
- fprintf(pysamerr, " -f, --fasta-ref <file> reference sequence\n");
- fprintf(pysamerr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
- fprintf(pysamerr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n");
+ fprintf(pysam_stderr, " split multiallelic sites into multiple rows; recover multiallelics from\n");
+ fprintf(pysam_stderr, " multiple rows.\n");
+ fprintf(pysam_stderr, "Usage: bcftools norm [options] <in.vcf.gz>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
+ fprintf(pysam_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
+ fprintf(pysam_stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
+ fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence\n");
+ fprintf(pysam_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
+ fprintf(pysam_stderr, "\n");
/* Printing the help text always ends the process with exit status 1. */
exit(1);
}
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->aln_win = 100;
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
{"threads",required_argument,NULL,9},
{"check-ref",required_argument,NULL,'c'},
{"strict-filter",no_argument,NULL,'s'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
break;
case 'o': args->output_fname = optarg; break;
case 'D':
- fprintf(pysamerr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n");
+ fprintf(pysam_stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n");
args->rmdup = COLLAPSE_NONE<<1;
break;
case 's': args->strict_filter = 1; break;
if ( *tmp ) error("Could not parse argument: --site-win %s\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
char **plugin_paths;
char **argv, *output_fname, *regions_list, *targets_list;
- int argc, drop_header, verbose;
+ int argc, drop_header, verbose, record_cmd_line;
}
args_t;
fprintf(stderr,
" in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
"- Is the plugin path correct?\n\n"
- "- Are all shared libraries, namely libhts.so, accessible? Verify with\n"
- " on Mac OS X: `otool -L your/plugin.so` and set DYLD_LIBRARY_PATH if they are not\n"
- " on Linux: `ldd your/plugin.so` and set LD_LIBRARY_PATH if they are not\n"
- "\n"
- "- If not installed systemwide, set the environment variable LD_LIBRARY_PATH (linux) or\n"
- "DYLD_LIBRARY_PATH (mac) to include directory where *libhts.so* is located.\n"
- "\n"
"- Run \"bcftools plugin -lv\" for more detailed error output.\n"
"\n",
getenv("BCFTOOLS_PLUGINS")
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
- bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
if ( !args->drop_header )
{
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(stderr, "VCF output options:\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->nplugin_paths = -1;
int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
if ( argc==1 ) usage(args);
+
char *plugin_name = NULL;
- if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; }
+ if ( argv[1][0]!='-' )
+ {
+ plugin_name = argv[1];
+ argc--;
+ argv++;
+ load_plugin(args, plugin_name, 1, &args->plugin);
+ if ( args->plugin.run )
+ {
+ int ret = args->plugin.run(argc, argv);
+ destroy_data(args);
+ free(args);
+ return ret;
+ }
+ }
static struct option loptions[] =
{
{"regions-file",required_argument,NULL,'R'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'l': plist_only = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?':
case 'h': usage_only = 1; break;
default: error("Unknown argument: %s\n", optarg);
if ( plist_only ) return list_plugins(args);
if ( usage_only && ! plugin_name ) usage(args);
- load_plugin(args, plugin_name, 1, &args->plugin);
if ( version_only )
{
const char *bver, *hver;
return 0;
}
- if ( args->plugin.run )
- {
- int iopt = optind; optind = 0;
- int ret = args->plugin.run(argc-iopt, argv+iopt);
- destroy_data(args);
- free(args);
- return ret;
- }
-
char *fname = NULL;
if ( optind>=argc || argv[optind][0]=='-' )
{
char **plugin_paths;
char **argv, *output_fname, *regions_list, *targets_list;
- int argc, drop_header, verbose;
+ int argc, drop_header, verbose, record_cmd_line;
}
args_t;
args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
args->plugin_paths[args->nplugin_paths] = dir;
args->nplugin_paths++;
- if ( args->verbose ) fprintf(pysamerr, "plugin directory %s .. ok\n", dir);
+ if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir);
}
else
{
- if ( args->verbose ) fprintf(pysamerr, "plugin directory %s .. %s\n", dir, strerror(errno));
+ if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
free(dir);
}
handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
if ( args->verbose )
{
- if ( !handle ) fprintf(pysamerr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
- else fprintf(pysamerr,"%s:\n\tdlopen .. ok\n", tmp);
+ if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
+ else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", tmp);
}
free(tmp);
if ( handle ) return handle;
handle = dlopen(fname, RTLD_NOW);
if ( args->verbose )
{
- if ( !handle ) fprintf(pysamerr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
- else fprintf(pysamerr,"%s:\n\tdlopen .. ok\n", fname);
+ if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
+ else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", fname);
}
return handle;
static void print_plugin_usage_hint(void)
{
- fprintf(pysamerr, "\nNo functional bcftools plugins were found");
+ fprintf(pysam_stderr, "\nNo functional bcftools plugins were found");
if ( !getenv("BCFTOOLS_PLUGINS") )
- fprintf(pysamerr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
+ fprintf(pysam_stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
else
- fprintf(pysamerr,
+ fprintf(pysam_stderr,
" in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
"- Is the plugin path correct?\n\n"
- "- Are all shared libraries, namely libhts.so, accessible? Verify with\n"
- " on Mac OS X: `otool -L your/plugin.so` and set DYLD_LIBRARY_PATH if they are not\n"
- " on Linux: `ldd your/plugin.so` and set LD_LIBRARY_PATH if they are not\n"
- "\n"
- "- If not installed systemwide, set the environment variable LD_LIBRARY_PATH (linux) or\n"
- "DYLD_LIBRARY_PATH (mac) to include directory where *libhts.so* is located.\n"
- "\n"
"- Run \"bcftools plugin -lv\" for more detailed error output.\n"
"\n",
getenv("BCFTOOLS_PLUGINS")
if ( ret )
plugin->init = NULL;
else
- if ( args->verbose ) fprintf(pysamerr,"\tinit .. ok\n");
+ if ( args->verbose ) fprintf(pysam_stderr,"\tinit .. ok\n");
plugin->run = (dl_run_f) dlsym(plugin->handle, "run");
ret = dlerror();
if ( ret )
plugin->run = NULL;
else
- if ( args->verbose ) fprintf(pysamerr,"\trun .. ok\n");
+ if ( args->verbose ) fprintf(pysam_stderr,"\trun .. ok\n");
if ( !plugin->init && !plugin->run )
{
if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name);
- else if ( args->verbose ) fprintf(pysamerr,"\tinit/run .. not found\n");
+ else if ( args->verbose ) fprintf(pysam_stderr,"\tinit/run .. not found\n");
return -1;
}
if ( ret )
{
if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name);
- else if ( args->verbose ) fprintf(pysamerr,"\tversion .. not found\n");
+ else if ( args->verbose ) fprintf(pysam_stderr,"\tversion .. not found\n");
return -1;
}
args->plugin.version(&bver, &hver);
if ( strcmp(bver,bcftools_version()) && !warned_bcftools )
{
- fprintf(pysamerr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver);
+ fprintf(pysam_stderr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver);
warned_bcftools = 1;
}
if ( strcmp(hver,hts_version()) && !warned_htslib )
{
- fprintf(pysamerr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
+ fprintf(pysam_stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
warned_htslib = 1;
}
args->drop_header += ret;
qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name);
for (i=0; i<nplugins; i++)
- printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about());
- printf("\n");
+ fprintf(pysam_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about());
+ fprintf(pysam_stdout, "\n");
}
else
print_plugin_usage_hint();
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
- bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
if ( !args->drop_header )
{
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
/*
 * Print the help text of the "bcftools plugin" command to the error stream
 * and terminate the process with exit status 1.  This function never returns.
 * The `args` parameter is not read anywhere in the body.
 */
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Run user defined plugin\n");
- fprintf(pysamerr, "Usage: bcftools plugin <name> [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
- fprintf(pysamerr, " bcftools +name [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "VCF input options:\n");
- fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, "VCF output options:\n");
- fprintf(pysamerr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(pysamerr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "Plugin options:\n");
- fprintf(pysamerr, " -h, --help list plugin's options\n");
- fprintf(pysamerr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(pysamerr, " -v, --verbose print debugging information on plugin failure\n");
- fprintf(pysamerr, " -V, --version print version string and exit\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Run user defined plugin\n");
+ fprintf(pysam_stderr, "Usage: bcftools plugin <name> [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
+ fprintf(pysam_stderr, " bcftools +name [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "VCF input options:\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
+ fprintf(pysam_stderr, " -i, --include <expr> select sites for which the expression is true\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, "VCF output options:\n");
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "Plugin options:\n");
+ fprintf(pysam_stderr, " -h, --help list plugin's options\n");
+ fprintf(pysam_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
+ fprintf(pysam_stderr, " -v, --verbose print debugging information on plugin failure\n");
+ fprintf(pysam_stderr, " -V, --version print version string and exit\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->nplugin_paths = -1;
int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
if ( argc==1 ) usage(args);
+
char *plugin_name = NULL;
- if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; }
+ if ( argv[1][0]!='-' )
+ {
+ plugin_name = argv[1];
+ argc--;
+ argv++;
+ load_plugin(args, plugin_name, 1, &args->plugin);
+ if ( args->plugin.run )
+ {
+ int ret = args->plugin.run(argc, argv);
+ destroy_data(args);
+ free(args);
+ return ret;
+ }
+ }
static struct option loptions[] =
{
{"regions-file",required_argument,NULL,'R'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'l': plist_only = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?':
case 'h': usage_only = 1; break;
default: error("Unknown argument: %s\n", optarg);
if ( plist_only ) return list_plugins(args);
if ( usage_only && ! plugin_name ) usage(args);
- load_plugin(args, plugin_name, 1, &args->plugin);
if ( version_only )
{
const char *bver, *hver;
args->plugin.version(&bver, &hver);
- printf("bcftools %s using htslib %s\n", bcftools_version(), hts_version());
- printf("plugin at %s using htslib %s\n\n", bver, hver);
+ fprintf(pysam_stdout, "bcftools %s using htslib %s\n", bcftools_version(), hts_version());
+ fprintf(pysam_stdout, "plugin at %s using htslib %s\n\n", bver, hver);
return 0;
}
if ( usage_only )
{
if ( args->plugin.usage )
- fprintf(pysamerr,"%s",args->plugin.usage());
+ fprintf(pysam_stderr,"%s",args->plugin.usage());
else
- fprintf(pysamerr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name);
+ fprintf(pysam_stderr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name);
return 0;
}
- if ( args->plugin.run )
- {
- int iopt = optind; optind = 0;
- int ret = args->plugin.run(argc-iopt, argv+iopt);
- destroy_data(args);
- free(args);
- return ret;
- }
-
char *fname = NULL;
if ( optind>=argc || argv[optind][0]=='-' )
{
int i;
bcf_sr_t *reader = &args->files->readers[0];
for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
- printf("%s\n", reader->header->samples[i]);
+ fprintf(pysam_stdout, "%s\n", reader->header->samples[i]);
}
static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
/*
 * Print the help text of the "bcftools query" command to the error stream
 * and terminate the process with exit status 1.  This function never returns.
 *
 * The help text is user-facing: it names the default output as "stdout",
 * not the internal pysam_stdout stream variable the code writes through.
 */
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n");
- fprintf(pysamerr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -c, --collapse <string> collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
- fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(pysamerr, " -f, --format <string> see man page for details\n");
- fprintf(pysamerr, " -H, --print-header print header\n");
- fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(pysamerr, " -l, --list-samples print the list of samples and exit\n");
- fprintf(pysamerr, " -o, --output-file <file> output file name [stdout]\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --samples <list> list of samples to include\n");
- fprintf(pysamerr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, " -u, --allow-undef-tags print \".\" for undefined tags\n");
- fprintf(pysamerr, " -v, --vcf-list <file> process multiple VCFs listed in the file\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Examples:\n");
- fprintf(pysamerr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n");
+ fprintf(pysam_stderr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -c, --collapse <string> collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -f, --format <string> see man page for details\n");
+ fprintf(pysam_stderr, " -H, --print-header print header\n");
+ fprintf(pysam_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -l, --list-samples print the list of samples and exit\n");
+ fprintf(pysam_stderr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --samples <list> list of samples to include\n");
+ fprintf(pysam_stderr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n");
+ fprintf(pysam_stderr, " -v, --vcf-list <file> process multiple VCFs listed in the file\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Examples:\n");
+ fprintf(pysam_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
}
if ( !args->format_str ) usage();
- args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout;
+ args->out = args->fn_out ? fopen(args->fn_out, "w") : pysam_stdout;
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
if ( !args->vcf_list )
}
}
- // update the transition matrix tprob
+ // update the transition matrix
+ int n = 1;
for (i=0; i<2; i++)
{
- int n = 0;
for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
- if ( !n) error("fixme: state %d not observed\n", i+1);
- for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n;
}
+ for (i=0; i<2; i++)
+ {
+ for (j=0; j<2; j++)
+ {
+ // no transition to i-th state was observed, set to a small number
+ if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n;
+ else MAT(tcounts,2,i,j) /= n;
+ }
+ }
+
+ // normalize
+ for (i=0; i<2; i++)
+ {
+ double norm = 0;
+ for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i);
+ assert( norm!=0 );
+ for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm;
+ }
+
if ( args->genmap_fname || args->rec_rate > 0 )
hmm_set_tprob(args->hmm, tcounts, 0);
else
deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
niter++;
-
- fprintf(stderr,"%d: %f %f\n", niter,deltaz,delthw);
+ fprintf(stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n",
+ niter,deltaz,delthw,
+ MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
}
while ( deltaz > 0.0 || delthw > 0.0 );
- fprintf(stderr, "Viterbi training converged in %d iterations to", niter);
double *tprob_arr = hmm_get_tprob(args->hmm);
- for (i=0; i<2; i++) for (j=0; j<2; j++) fprintf(stderr, " %f", MAT(tprob_arr,2,i,j));
- fprintf(stderr, "\n");
+ fprintf(stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter,
+ MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
// output the results
for (i=0; i<args->nrids; i++)
int ioff = args->rid_offs[i];
int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+ double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
for (j=0; j<nsites; j++)
{
- printf("%s\t%d\t%d\t..\n", chr,args->sites[ioff+j]+1,vpath[j*2]==STATE_AZ ? 1 : 0);
+ int state = vpath[j*2];
+ double pval = fwd[j*2 + state];
+ printf("%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval);
}
}
}
args->hmm = hmm_init(2, tprob, 10000);
// print header
- printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
- printf("# The command line was:\tbcftools %s", args->argv[0]);
+ fprintf(pysam_stdout, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ fprintf(pysam_stdout, "# The command line was:\tbcftools %s", args->argv[0]);
for (i=1; i<args->argc; i++)
- printf(" %s",args->argv[i]);
- printf("\n#\n");
- printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
+ fprintf(pysam_stdout, " %s",args->argv[i]);
+ fprintf(pysam_stdout, "\n#\n");
+ fprintf(pysam_stdout, "# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
}
static void destroy_data(args_t *args)
{
int state = vpath[i*2]==STATE_AZ ? 1 : 0;
double *pval = fwd + i*2;
- printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
+ fprintf(pysam_stdout, "%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
}
return;
}
}
}
- // update the transition matrix tprob
+ // update the transition matrix
+ int n = 1;
for (i=0; i<2; i++)
{
- int n = 0;
for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
- if ( !n) error("fixme: state %d not observed\n", i+1);
- for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n;
}
+ for (i=0; i<2; i++)
+ {
+ for (j=0; j<2; j++)
+ {
+ // no transition to i-th state was observed, set to a small number
+ if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n;
+ else MAT(tcounts,2,i,j) /= n;
+ }
+ }
+
+ // normalize
+ for (i=0; i<2; i++)
+ {
+ double norm = 0;
+ for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i);
+ assert( norm!=0 );
+ for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm;
+ }
+
if ( args->genmap_fname || args->rec_rate > 0 )
hmm_set_tprob(args->hmm, tcounts, 0);
else
deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
niter++;
-
- fprintf(pysamerr,"%d: %f %f\n", niter,deltaz,delthw);
+ fprintf(pysam_stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n",
+ niter,deltaz,delthw,
+ MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
}
while ( deltaz > 0.0 || delthw > 0.0 );
- fprintf(pysamerr, "Viterbi training converged in %d iterations to", niter);
double *tprob_arr = hmm_get_tprob(args->hmm);
- for (i=0; i<2; i++) for (j=0; j<2; j++) fprintf(pysamerr, " %f", MAT(tprob_arr,2,i,j));
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter,
+ MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
// output the results
for (i=0; i<args->nrids; i++)
int ioff = args->rid_offs[i];
int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+ double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
for (j=0; j<nsites; j++)
{
- printf("%s\t%d\t%d\t..\n", chr,args->sites[ioff+j]+1,vpath[j*2]==STATE_AZ ? 1 : 0);
+ int state = vpath[j*2];
+ double pval = fwd[j*2 + state];
+ fprintf(pysam_stdout, "%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval);
}
}
}
if ( skip_rid )
{
- fprintf(pysamerr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
+ fprintf(pysam_stderr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
args->skip_rid = line->rid;
return;
}
/*
 * Print the help text of the "bcftools roh" command (general and HMM
 * options) to the error stream and terminate the process with exit status 1.
 * This function never returns.  The `args` parameter is not read anywhere in
 * the body.
 */
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: HMM model for detecting runs of autozygosity.\n");
- fprintf(pysamerr, "Usage: bcftools roh [options] <in.vcf.gz>\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "General Options:\n");
- fprintf(pysamerr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
- fprintf(pysamerr, " --AF-tag <TAG> use TAG for allele frequency\n");
- fprintf(pysamerr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(pysamerr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
- fprintf(pysamerr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
- fprintf(pysamerr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
- fprintf(pysamerr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
- fprintf(pysamerr, " -M, --rec-rate <float> constant recombination rate per bp\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --sample <sample> sample to analyze\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "HMM Options:\n");
- fprintf(pysamerr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
- fprintf(pysamerr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
- fprintf(pysamerr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: HMM model for detecting runs of autozygosity.\n");
+ fprintf(pysam_stderr, "Usage: bcftools roh [options] <in.vcf.gz>\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "General Options:\n");
+ fprintf(pysam_stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
+ fprintf(pysam_stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
+ fprintf(pysam_stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(pysam_stderr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
+ fprintf(pysam_stderr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
+ fprintf(pysam_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
+ fprintf(pysam_stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
+ fprintf(pysam_stderr, " -M, --rec-rate <float> constant recombination rate per bp\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --sample <sample> sample to analyze\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "HMM Options:\n");
+ fprintf(pysam_stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
+ fprintf(pysam_stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
+ fprintf(pysam_stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
vcfroh(args, args->files->readers[0].buffer[0]);
}
vcfroh(args, NULL);
- fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
destroy_data(args);
free(args);
return 0;
/*
* char *t, *p = str;
* t = column_next(p, '\t');
- * if ( strlen("<something>")==t-p && !strncmp(p,"<something>",t-p) ) printf("found!\n");
+ * if ( strlen("<something>")==t-p && !strncmp(p,"<something>",t-p) ) fprintf(pysam_stdout, "found!\n");
*
* char *t;
* t = column_next(str, '\t'); if ( !*t ) error("expected field\n", str);
fprintf(fp,"%e\t%f\t%f\n", prev_score, (float)igood/ngood, (float)ibad/nbad);
if ( !printed && (float)igood/ngood > 0.9 )
{
- printf("%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
+ fprintf(pysam_stdout, "%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
printed = 1;
}
else if ( igood<ngood ) prev_score = good[igood];
else prev_score = bad[ibad];
}
- if ( !printed ) printf("%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
+ if ( !printed ) fprintf(pysam_stdout, "%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score);
if ( fp )
{
if ( fclose(fp) ) error("%s.eval: fclose failed: %s\n",args->prefix,strerror(errno));
case MERGE_MAX: score = get_max_score(args, -1); break;
case MERGE_AVG: score = get_avg_score(args, -1); break;
}
- printf("%e\n", 1.0 - score/max_score);
+ fprintf(pysam_stdout, "%e\n", 1.0 - score/max_score);
}
annots_reader_close(args);
}
/*
 * Print the help text of the "bcftools som" command (training, classifying
 * and experimental options) to the error stream and terminate the process
 * with exit status 1.  This function never returns.
 */
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: SOM (Self-Organizing Map) filtering.\n");
- fprintf(pysamerr, "Usage: bcftools som --train [options] <annots.tab.gz>\n");
- fprintf(pysamerr, " bcftools som --classify [options]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Model training options:\n");
- fprintf(pysamerr, " -f, --nfold <int> n-fold cross-validation (number of maps) [5]\n");
- fprintf(pysamerr, " -p, --prefix <string> prefix of output files\n");
- fprintf(pysamerr, " -s, --size <int> map size [20]\n");
- fprintf(pysamerr, " -t, --train \n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Classifying options:\n");
- fprintf(pysamerr, " -c, --classify \n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Experimental training options (no reason to change):\n");
- fprintf(pysamerr, " -b, --bmu-threshold <float> threshold for selection of best-matching unit [0.9]\n");
- fprintf(pysamerr, " -d, --som-dimension <int> SOM dimension [2]\n");
- fprintf(pysamerr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n");
- fprintf(pysamerr, " -l, --learning-rate <float> learning rate [1.0]\n");
- fprintf(pysamerr, " -m, --merge <min|max|avg> -f merge algorithm [avg]\n");
- fprintf(pysamerr, " -n, --ntrain-sites <int> effective number of training sites [number of good sites]\n");
- fprintf(pysamerr, " -r, --random-seed <int> random seed, 0 for time() [1]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: SOM (Self-Organizing Map) filtering.\n");
+ fprintf(pysam_stderr, "Usage: bcftools som --train [options] <annots.tab.gz>\n");
+ fprintf(pysam_stderr, " bcftools som --classify [options]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Model training options:\n");
+ fprintf(pysam_stderr, " -f, --nfold <int> n-fold cross-validation (number of maps) [5]\n");
+ fprintf(pysam_stderr, " -p, --prefix <string> prefix of output files\n");
+ fprintf(pysam_stderr, " -s, --size <int> map size [20]\n");
+ fprintf(pysam_stderr, " -t, --train \n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Classifying options:\n");
+ fprintf(pysam_stderr, " -c, --classify \n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Experimental training options (no reason to change):\n");
+ fprintf(pysam_stderr, " -b, --bmu-threshold <float> threshold for selection of best-matching unit [0.9]\n");
+ fprintf(pysam_stderr, " -d, --som-dimension <int> SOM dimension [2]\n");
+ fprintf(pysam_stderr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n");
+ fprintf(pysam_stderr, " -l, --learning-rate <float> learning rate [1.0]\n");
+ fprintf(pysam_stderr, " -m, --merge <min|max|avg> -f merge algorithm [avg]\n");
+ fprintf(pysam_stderr, " -n, --ntrain-sites <int> effective number of training sites [number of good sites]\n");
+ fprintf(pysam_stderr, " -r, --random-seed <int> random seed, 0 for time() [1]\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
case 'd':
args->ndim = atoi(optarg);
if ( args->ndim<2 ) error("Expected -d >=2, got %d\n", args->ndim);
- if ( args->ndim>3 ) fprintf(pysamerr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim);
+ if ( args->ndim>3 ) fprintf(pysam_stderr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim);
break;
case 't': args->action = SOM_TRAIN; break;
case 'c': args->action = SOM_CLASSIFY; break;
/*
 * Debug helper: write one context entry as a single line to the output
 * stream, in the form "<cnt>\t<seq>\n".  The sequence is emitted one
 * character at a time for idc->len characters, so it is not required to be
 * NUL-terminated here.
 */
static void _indel_ctx_print1(_idc1_t *idc)
{
int i;
- fprintf(stdout, "%d\t", idc->cnt);
+ fprintf(pysam_stdout, "%d\t", idc->cnt);
for (i=0; i<idc->len; i++)
- fputc(idc->seq[i], stdout);
- fputc('\n', stdout);
+ fputc(idc->seq[i], pysam_stdout);
+ fputc('\n', pysam_stdout);
}
/*
 * Debug helper: print each of the ctx->ndat entries in ctx->dat with
 * _indel_ctx_print1(), then write a terminating blank line to the output
 * stream.
 */
static void _indel_ctx_print(indel_ctx_t *ctx)
{
int i;
for (i=0; i<ctx->ndat; i++)
_indel_ctx_print1(&ctx->dat[i]);
- fputc('\n',stdout);
+ fputc('\n',pysam_stdout);
}
#endif
static int _indel_ctx_lookup(indel_ctx_t *ctx, char *seq, int seq_len, int *hit)
}
#if IC_DBG
- fprintf(stdout,"ref: %s\n", ref);
- fprintf(stdout,"alt: %s\n", alt);
- fprintf(stdout,"ctx: %s\n", fai_ref);
+ fprintf(pysam_stdout,"ref: %s\n", ref);
+ fprintf(pysam_stdout,"alt: %s\n", alt);
+ fprintf(pysam_stdout,"ctx: %s\n", fai_ref);
_indel_ctx_print(ctx);
#endif
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(pysamerr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+ default: fprintf(pysam_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
}
#undef BRANCH_INT
}
{
nmm++;
bcf_sr_t *reader = &files->readers[0];
- printf("DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2);
+ fprintf(pysam_stdout, "DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2);
}
else
{
}
}
float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0;
- printf("PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd);
+ fprintf(pysam_stdout, "PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd);
}
}
}
static void print_header(args_t *args)
{
int i;
- printf("# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version());
- printf("# The command line was:\tbcftools %s ", args->argv[0]);
+ fprintf(pysam_stdout, "# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version());
+ fprintf(pysam_stdout, "# The command line was:\tbcftools %s ", args->argv[0]);
for (i=1; i<args->argc; i++)
- printf(" %s",args->argv[i]);
- printf("\n#\n");
+ fprintf(pysam_stdout, " %s",args->argv[i]);
+ fprintf(pysam_stdout, "\n#\n");
- printf("# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n");
+ fprintf(pysam_stdout, "# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n");
if ( args->files->nreaders==1 )
{
const char *fname = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : "<STDIN>";
if ( args->split_by_id )
{
- printf("ID\t0\t%s:known (sites with ID different from \".\")\n", fname);
- printf("ID\t1\t%s:novel (sites where ID column is \".\")\n", fname);
+ fprintf(pysam_stdout, "ID\t0\t%s:known (sites with ID different from \".\")\n", fname);
+ fprintf(pysam_stdout, "ID\t1\t%s:novel (sites where ID column is \".\")\n", fname);
}
else
- printf("ID\t0\t%s\n", fname);
+ fprintf(pysam_stdout, "ID\t0\t%s\n", fname);
}
else
{
const char *fname0 = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : "<STDIN>";
const char *fname1 = strcmp("-",args->files->readers[1].fname) ? args->files->readers[1].fname : "<STDIN>";
- printf("ID\t0\t%s\n", fname0);
- printf("ID\t1\t%s\n", fname1);
- printf("ID\t2\t%s\t%s\n", fname0,fname1);
+ fprintf(pysam_stdout, "ID\t0\t%s\n", fname0);
+ fprintf(pysam_stdout, "ID\t1\t%s\n", fname1);
+ fprintf(pysam_stdout, "ID\t2\t%s\t%s\n", fname0,fname1);
if ( args->verbose_sites )
{
- printf(
+ fprintf(pysam_stdout,
"# Verbose per-site discordance output.\n"
"# PSD\t[2]CHROM\t[3]POS\t[4]Number of matches\t[5]Number of mismatches\t[6]NRD\n");
- printf(
+ fprintf(pysam_stdout,
"# Verbose per-site and per-sample output. Genotype codes: %d:HomRefRef, %d:HomAltAlt, %d:HetAltRef, %d:HetAltAlt, %d:haploidRef, %d:haploidAlt\n"
"# DBG\t[2]CHROM\t[3]POS\t[4]Sample\t[5]GT in %s\t[6]GT in %s\n",
GT_HOM_RR, GT_HOM_AA, GT_HET_RA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A, fname0,fname1);
static void print_stats(args_t *args)
{
int i, id;
- printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
+ fprintf(pysam_stdout, "# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
- printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
+ fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
- printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records);
- printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts);
- printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps);
- printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps);
- printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels);
- printf("SN\t%d\tnumber of others:\t%d\n", id, stats->n_others);
- printf("SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals);
- printf("SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of records:\t%d\n", id, stats->n_records);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of others:\t%d\n", id, stats->n_others);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals);
}
- printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
+ fprintf(pysam_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
int ts=0,tv=0;
for (i=0; i<args->m_af; i++) { ts += stats->af_ts[i]; tv += stats->af_tv[i]; }
- printf("TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0);
+ fprintf(pysam_stdout, "TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0);
}
if ( args->exons_fname )
{
- printf("# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n");
+ fprintf(pysam_stdout, "# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n");
for (id=0; id<args->nstats; id++)
{
int in=args->stats[id].in_frame, out=args->stats[id].out_frame, na=args->stats[id].na_frame;
int in1=args->stats[id].in_frame_alt1, out1=args->stats[id].out_frame_alt1, na1=args->stats[id].na_frame_alt1;
- printf("FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0);
+ fprintf(pysam_stdout, "FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0);
}
}
if ( args->indel_ctx )
{
- printf("# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
+ fprintf(pysam_stdout, "# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
for (id=0; id<args->nstats; id++)
{
int nc = 0, ni = 0, na = args->stats[id].n_repeat_na;
nc += args->stats[id].n_repeat[i][0] + args->stats[id].n_repeat[i][2];
ni += args->stats[id].n_repeat[i][1] + args->stats[id].n_repeat[i][3];
}
- printf("ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0);
+ fprintf(pysam_stdout, "ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0);
}
- printf("# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
+ fprintf(pysam_stdout, "# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
for (id=0; id<args->nstats; id++)
{
for (i=1; i<IRC_RLEN; i++)
{
int nc = args->stats[id].n_repeat[i][0]+args->stats[id].n_repeat[i][2], ni = args->stats[id].n_repeat[i][1]+args->stats[id].n_repeat[i][3];
- printf("ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1,
+ fprintf(pysam_stdout, "ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1,
args->stats[id].n_repeat[i][0],args->stats[id].n_repeat[i][1],args->stats[id].n_repeat[i][2],args->stats[id].n_repeat[i][3],
nc+ni ? (float)nc/(nc+ni) : 0.0);
}
}
}
- printf("# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+ fprintf(pysam_stdout, "# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
- printf("SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0],
+ fprintf(pysam_stdout, "SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0],
stats->af_repeats[0][0]+stats->af_repeats[1][0]+stats->af_repeats[2][0],stats->af_repeats[0][0],stats->af_repeats[1][0],stats->af_repeats[2][0]);
// put the singletons stats into the first AF bin, note that not all of the stats is transferred (i.e. nrd mismatches)
stats->af_snps[1] += stats->af_snps[0];
stats->af_repeats[1][1] += stats->af_repeats[1][0];
stats->af_repeats[2][1] += stats->af_repeats[2][0];
}
- printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+ fprintf(pysam_stdout, "# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=1; i<args->m_af; i++) // note that af[1] now contains also af[0], see SiS stats output above
{
if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue;
- printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
+ fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]);
}
}
#if QUAL_STATS
- printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
+ fprintf(pysam_stdout, "# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->m_qual; i++)
{
if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue;
- printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
+ fprintf(pysam_stdout, "QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
}
}
#endif
for (i=0; i<args->nusr; i++)
{
- printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
+ fprintf(pysam_stdout, "# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag);
for (id=0; id<args->nstats; id++)
{
if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins
float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n";
- printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
+ fprintf(pysam_stdout, fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
}
}
}
- printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n");
+ fprintf(pysam_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=stats->m_indel-1; i>=0; i--)
- if ( stats->deletions[i] ) printf("IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]);
+ if ( stats->deletions[i] ) fprintf(pysam_stdout, "IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]);
for (i=0; i<stats->m_indel; i++)
- if ( stats->insertions[i] ) printf("IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]);
+ if ( stats->insertions[i] ) fprintf(pysam_stdout, "IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]);
}
- printf("# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n");
+ fprintf(pysam_stdout, "# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n");
for (id=0; id<args->nstats; id++)
{
int t;
for (t=0; t<15; t++)
{
if ( t>>2 == (t&3) ) continue;
- printf("ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]);
+ fprintf(pysam_stdout, "ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]);
}
}
if ( args->files->nreaders>1 && args->files->n_smpl )
{
- printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
+ fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
int x;
for (x=0; x<2; x++)
gtcmp_t *stats;
if ( x==0 )
{
- printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
stats = args->af_gts_snps;
}
else
{
- printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
stats = args->af_gts_indels;
}
uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
nrd_mm[j] += stats[i].mm[j];
}
if ( !i || !n ) continue; // skip singleton stats and empty bins
- printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- printf("\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
+ fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
+ fprintf(pysam_stdout, "\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
}
if ( x==0 )
{
- printf("# NRD and discordance is calculated as follows:\n");
- printf("# m .. number of matches\n");
- printf("# x .. number of mismatches\n");
- printf("# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
- printf("# RR discordance = xRR / (xRR + mRR)\n");
- printf("# RA discordance = xRA / (xRA + mRA)\n");
- printf("# AA discordance = xAA / (xAA + mAA)\n");
- printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
+ fprintf(pysam_stdout, "# NRD and discordance is calculated as follows:\n");
+ fprintf(pysam_stdout, "# m .. number of matches\n");
+ fprintf(pysam_stdout, "# x .. number of mismatches\n");
+ fprintf(pysam_stdout, "# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
+ fprintf(pysam_stdout, "# RR discordance = xRR / (xRR + mRR)\n");
+ fprintf(pysam_stdout, "# RA discordance = xRA / (xRA + mRA)\n");
+ fprintf(pysam_stdout, "# AA discordance = xAA / (xAA + mAA)\n");
+ fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
}
else
- printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
+ fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)];
uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)];
- printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
+ fprintf(pysam_stdout, "NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
m+mm ? mm*100.0/(m+mm) : 0,
nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0,
nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)] ? nrd_mm[T2S(GT_HET_RA)]*100.0/(nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)]) : 0,
smpl_r_t *smpl_r_array;
if ( x==0 )
{
- printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+ fprintf(pysam_stdout, "# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_snps;
smpl_r_array = args->smpl_r_snps;
}
else
{
- printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+ fprintf(pysam_stdout, "# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_indels;
smpl_r_array = args->smpl_r_indels;
}
double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
}
- printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- if (smpl_r->n && !isnan(r)) printf("\t%f\n", r*r);
- else printf("\t"NA_STRING"\n");
+ fprintf(pysam_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
+ if (smpl_r->n && !isnan(r)) fprintf(pysam_stdout, "\t%f\n", r*r);
+ else fprintf(pysam_stdout, "\t"NA_STRING"\n");
}
}
}
- printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
+ fprintf(pysam_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<stats->dp.m_vals; i++)
{
if ( stats->dp.vals[i]==0 && stats->dp_sites.vals[i]==0 ) continue;
- printf("DP\t%d\t", id);
- if ( i==0 ) printf("<%d", stats->dp.min);
- else if ( i+1==stats->dp.m_vals ) printf(">%d", stats->dp.max);
- else printf("%d", idist_i2bin(&stats->dp,i));
- printf("\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0);
- printf("\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0);
+ fprintf(pysam_stdout, "DP\t%d\t", id);
+ if ( i==0 ) fprintf(pysam_stdout, "<%d", stats->dp.min);
+ else if ( i+1==stats->dp.m_vals ) fprintf(pysam_stdout, ">%d", stats->dp.max);
+ else fprintf(pysam_stdout, "%d", idist_i2bin(&stats->dp,i));
+ fprintf(pysam_stdout, "\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0);
}
}
if ( args->files->n_smpl )
{
- printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
+ fprintf(pysam_stdout, "# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
{
float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
- printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
+ fprintf(pysam_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]);
}
}
- printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n");
+ fprintf(pysam_stdout, "# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
}
int nhom = stats->smpl_indel_homs[i];
int nhet = stats->smpl_indel_hets[i];
- printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom);
+ fprintf(pysam_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom);
}
}
#ifdef HWE_STATS
- printf("# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n");
+ fprintf(pysam_stdout, "# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
if ( !sum_tot ) continue;
int nprn = 3;
- printf("HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
+ fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
for (j=0; j<args->naf_hwe; j++)
{
sum_tmp += ptr[j];
float frac = (float)sum_tmp/sum_tot;
if ( frac >= 0.75 )
{
- while (nprn>0) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ while (nprn>0) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; }
break;
}
if ( frac >= 0.5 )
{
- while (nprn>1) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ while (nprn>1) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; }
continue;
}
if ( frac >= 0.25 )
{
- while (nprn>2) { printf("\t%f", (float)j/args->naf_hwe); nprn--; }
+ while (nprn>2) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; }
}
}
assert(nprn==0);
- printf("\n");
+ fprintf(pysam_stdout, "\n");
}
}
#endif
// Prints the help text of "bcftools stats" (description, usage line and the
// option list) and then terminates the process via exit(1); control never
// returns to the caller.
// Every removed/added line pair below differs only in the stream name
// (pysamerr -> pysam_stderr); the message text itself is unchanged.
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n");
- fprintf(pysamerr, " When two files are given, the program generates separate stats for intersection\n");
- fprintf(pysamerr, " and the complements. By default only sites are compared, -s/-S must given to include\n");
- fprintf(pysamerr, " also sample columns.\n");
- fprintf(pysamerr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
- fprintf(pysamerr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
- fprintf(pysamerr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
- fprintf(pysamerr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(pysamerr, " -E, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
- fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(pysamerr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
- fprintf(pysamerr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(pysamerr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -s, --samples <list> list of samples for sample stats, \"-\" to include all samples\n");
- fprintf(pysamerr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(pysamerr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(pysamerr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysamerr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
- fprintf(pysamerr, " -v, --verbose produce verbose per-site and per-sample output\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n");
+ fprintf(pysam_stderr, " When two files are given, the program generates separate stats for intersection\n");
+ fprintf(pysam_stderr, " and the complements. By default only sites are compared, -s/-S must given to include\n");
+ fprintf(pysam_stderr, " also sample columns.\n");
+ fprintf(pysam_stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
+ fprintf(pysam_stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(pysam_stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
+ fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -E, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
+ fprintf(pysam_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysam_stderr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
+ fprintf(pysam_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -s, --samples <list> list of samples for sample stats, \"-\" to include all samples\n");
+ fprintf(pysam_stderr, " -S, --samples-file <file> file of samples to include\n");
+ fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(pysam_stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
+ fprintf(pysam_stderr, "\n");
// terminate with status 1 once the help text has been written
exit(1);
}
int sample_is_file, force_samples;
char *include_types, *exclude_types;
int include, exclude;
+ int record_cmd_line;
htsFile *out;
}
args_t;
bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
}
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+ else bcf_hdr_sync(args->hdr);
// setup sample data
if (args->sample_names)
fprintf(stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
args->update_info = 1;
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
{"max-af",required_argument,NULL,'Q'},
{"phased",no_argument,NULL,'p'},
{"exclude-phased",no_argument,NULL,'P'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
break;
}
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?': usage(args);
default: error("Unknown argument: %s\n", optarg);
}
int sample_is_file, force_samples;
char *include_types, *exclude_types;
int include, exclude;
+ int record_cmd_line;
htsFile *out;
}
args_t;
bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
}
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+ else bcf_hdr_sync(args->hdr);
// setup sample data
if (args->sample_names)
for (i=0; i<nsmpl; i++) {
if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
if (args->force_samples) {
- fprintf(pysamerr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
+ fprintf(pysam_stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
} else {
error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
}
for (i=0; i<nsmpl; i++) {
if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
if (args->force_samples) {
- fprintf(pysamerr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
+ fprintf(pysam_stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
continue;
} else {
error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
free(smpl);
khash_str2int_destroy(hdr_samples);
if (args->n_samples == 0) {
- fprintf(pysamerr, "Warn: subsetting has removed all samples\n");
+ fprintf(pysam_stderr, "Warn: subsetting has removed all samples\n");
args->sites_only = 1;
}
}
// determine variant types to include/exclude
if (args->include_types || args->exclude_types) {
if (args->include_types && args->exclude_types) {
- fprintf(pysamerr, "Error: only supply one of --include-types, --exclude-types options\n");
+ fprintf(pysam_stderr, "Error: only supply one of --include-types, --exclude-types options\n");
exit(1);
}
char **type_list = 0;
else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
else {
- fprintf(pysamerr, "[E::%s] unknown type\n", type_list[i]);
- fprintf(pysamerr, "Accepted types are snps, indels, mnps, other\n");
+ fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]);
+ fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n");
exit(1);
}
}
else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
else {
- fprintf(pysamerr, "[E::%s] unknown type\n", type_list[i]);
- fprintf(pysamerr, "Accepted types are snps, indels, mnps, other\n");
+ fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]);
+ fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n");
exit(1);
}
}
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
- default: fprintf(pysamerr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
+ default: fprintf(pysam_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
}
#undef BRANCH_INT
if (!sample_phased) {
static void usage(args_t *args)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
- fprintf(pysamerr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Output options:\n");
- fprintf(pysamerr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
- fprintf(pysamerr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
- fprintf(pysamerr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
- fprintf(pysamerr, " -o, --output-file <file> output file name [stdout]\n");
- fprintf(pysamerr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(pysamerr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(pysamerr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysamerr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(pysamerr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(pysamerr, " --threads <int> number of extra output compression threads [0]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Subset options:\n");
- fprintf(pysamerr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
- fprintf(pysamerr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
- fprintf(pysamerr, " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" prefix)\n");
- fprintf(pysamerr, " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n");
- fprintf(pysamerr, " --force-samples only warn about unknown subset samples\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Filter options:\n");
- fprintf(pysamerr, " -c/C, --min-ac/--max-ac <int>[:<type>] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
- fprintf(pysamerr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
- fprintf(pysamerr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(pysamerr, " -g, --genotype [^]<hom|het|miss> require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n");
- fprintf(pysamerr, " -i/e, --include/--exclude <expr> select/exclude sites for which the expression is true (see man page for details)\n");
- fprintf(pysamerr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n");
- fprintf(pysamerr, " -m/M, --min-alleles/--max-alleles <int> minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
- fprintf(pysamerr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n");
- fprintf(pysamerr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
- fprintf(pysamerr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
- fprintf(pysamerr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
- fprintf(pysamerr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
- fprintf(pysamerr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
+ fprintf(pysam_stderr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Output options:\n");
+ fprintf(pysam_stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
+ fprintf(pysam_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
+ fprintf(pysam_stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
+ fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -o, --output-file <file> output file name [pysam_stdout]\n");
+ fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
+ fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(pysam_stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(pysam_stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Subset options:\n");
+ fprintf(pysam_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
+ fprintf(pysam_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
+ fprintf(pysam_stderr, " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(pysam_stderr, " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(pysam_stderr, " --force-samples only warn about unknown subset samples\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Filter options:\n");
+ fprintf(pysam_stderr, " -c/C, --min-ac/--max-ac <int>[:<type>] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(pysam_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysam_stderr, " -g, --genotype [^]<hom|het|miss> require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n");
+ fprintf(pysam_stderr, " -i/e, --include/--exclude <expr> select/exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(pysam_stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n");
+ fprintf(pysam_stderr, " -m/M, --min-alleles/--max-alleles <int> minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
+ fprintf(pysam_stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n");
+ fprintf(pysam_stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(pysam_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
+ fprintf(pysam_stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
+ fprintf(pysam_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+ fprintf(pysam_stderr, "\n");
exit(1);
}
args->update_info = 1;
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
{"max-af",required_argument,NULL,'Q'},
{"phased",no_argument,NULL,'p'},
{"exclude-phased",no_argument,NULL,'P'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
break;
}
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?': usage(args);
default: error("Unknown argument: %s\n", optarg);
}
{
va_list ap;
va_start(ap, format);
- vfprintf(pysamerr, format, ap);
+ vfprintf(pysam_stderr, format, ap);
va_end(ap);
exit(-1);
}
-#define BCFTOOLS_VERSION "1.3"
+#define BCFTOOLS_VERSION "1.3.1"
As pysam is a wrapper around htslib and the samtools package, I
suggest citing `Li et al (2009) <http://www.ncbi.nlm.nih.gov/pubmed/19505943>`.
-Is pysam thread-save?
+Is pysam thread-safe?
=====================
Pysam is a mix of python and C code. Instructions within python are
In alignments with soft clipping part of the query sequence
are not aligned. The unaligned query sequence is still part
- of the alignment record. This is in difference to hard clipped reads.
+ of the alignment record. This is in contrast to
+ :term:`hard clipped` reads.
hard clipping
hard clipped
during installation. However, when installing the source tarball on
python 3 or building from the repository, these pre-built C-files are
not present and cython needs to be installed beforehand.
-
-
-
-
-
Release notes
=============
+Release 0.9.1
+=============
+
+This is a bugfix release addressing some installation problems
+in pysam 0.9.0, in particular:
+
+* patch included htslib to work with older libcurl versions, fixes #262.
+* do not require cython for python 3 install, fixes #260
+* FastaFile does not accept filepath_index any more, see #270
+* add AlignedSegment.get_cigar_stats method.
+* py3 bugfix in VariantFile.subset_samples, fixes #272
+* add missing sysconfig import, fixes #278
+* do not redirect stdout, but instead write to a separately
+ created file. This should resolve issues when pysam is used
+ in notebooks or other environments that redirect stdout.
+* wrap htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1
+* use bgzf throughout instead of gzip
+* allow specifying a fasta reference for CRAM file when opening
+ for both read and write, fixes #280
+
Release 0.9.0
=============
# For samtools, type:
# rm -rf samtools
# python import.py samtools download/samtools
+#
# Manually, then:
# modify config.h to set compatibility flags
-# change bamtk.c.pysam.c/main to bamtk.c.pysam.c/samtools_main
#
# For bcftools, type:
# rm -rf bcftools
# python import.py bcftools download/bcftools
+# rm -rf bcftools/test bcftools/plugins
+
+import fnmatch
import os
+import re
+import shutil
import sys
-import fnmatch
+import hashlib
+
+
+EXCLUDE = {
+ "samtools": (
+ "razip.c", "bgzip.c", "main.c",
+ "calDepth.c", "bam2bed.c", "wgsim.c",
+ "md5fa.c", "md5sum-lite.c", "maq2sam.c",
+ "bamcheck.c", "chk_indel.c", "vcf-miniview.c",
+ "htslib-1.3", # do not import twice
+ "hfile_irods.c", # requires irods library
+ ),
+ "bcftools": (
+ "test", "plugins", "peakfit.c",
+ "peakfit.h",
+ # needs to be renamed, name conflict with samtools reheader
+ "reheader.c",
+ "polysomy.c"),
+ "htslib": (
+ 'htslib/tabix.c', 'htslib/bgzip.c',
+ 'htslib/htsfile.c', 'htslib/hfile_irods.c'),
+}
+
+
+MAIN = {
+ "samtools": "bamtk",
+ "bcftools": "main"
+}
+
def locate(pattern, root=os.curdir):
def _update_pysam_files(cf, destdir):
'''update pysam files applying redirection of output'''
+ basename = os.path.basename(destdir)
for filename in cf:
if not filename:
continue
dest = filename + ".pysam.c"
with open(filename) as infile:
+ lines = "".join(infile.readlines())
with open(dest, "w") as outfile:
outfile.write('#include "pysam.h"\n\n')
- outfile.write(
- re.sub("stderr", "pysamerr", "".join(infile.readlines())))
+ subname, _ = os.path.splitext(os.path.basename(filename))
+ if subname in MAIN.get(basename, []):
+ lines = re.sub("int main\(", "int {}_main(".format(
+ basename), lines)
+ else:
+ lines = re.sub("int main\(", "int {}_{}_main(".format(
+ basename, subname), lines)
+ lines = re.sub("stderr", "pysam_stderr", lines)
+ lines = re.sub("stdout", "pysam_stdout", lines)
+ lines = re.sub(" printf\(", " fprintf(pysam_stdout, ", lines)
+ lines = re.sub("([^kf])puts\(([^)]+)\)",
+ r"\1fputs(\2, pysam_stdout) & fputc('\\n', pysam_stdout)",
+ lines)
+ lines = re.sub("putchar\(([^)]+)\)",
+ r"fputc(\1, pysam_stdout)", lines)
+
+ fn = os.path.basename(filename)
+ # some specific fixes:
+ SPECIFIC_SUBSTITUTIONS = {
+ "bam_md.c": (
+ 'sam_open_format("-", mode_w',
+ 'sam_open_format(pysam_stdout_fn, mode_w'),
+ "phase.c": (
+ 'putc("ACGT"[f->seq[j] == 1? (c&3, pysam_stdout) : (c>>16&3)]);',
+ 'putc("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)], pysam_stdout);'),
+ "cut_target.c": (
+ 'putc(33 + (cns[j]>>8>>2, pysam_stdout));',
+ 'putc(33 + (cns[j]>>8>>2), pysam_stdout);')
+ }
+ if fn in SPECIFIC_SUBSTITUTIONS:
+ lines = lines.replace(
+ SPECIFIC_SUBSTITUTIONS[fn][0],
+ SPECIFIC_SUBSTITUTIONS[fn][1])
+ outfile.write(lines)
+
with open(os.path.join(destdir, "pysam.h"), "w")as outfile:
outfile.write("""#ifndef PYSAM_H
#define PYSAM_H
#include "stdio.h"
-extern FILE * pysamerr;
+extern FILE * pysam_stderr;
+extern FILE * pysam_stdout;
+extern const char * pysam_stdout_fn;
#endif
""")
if len(sys.argv) != 3:
raise ValueError("import requires dest src")
- dest, srcdir = sys.argv[2:4]
+ dest, srcdir = sys.argv[1:3]
if dest not in EXCLUDE:
raise ValueError("import expected one of %s" %
",".join(EXCLUDE.keys()))
import os
import sys
+import sysconfig
from pysam.libchtslib import *
from pysam.cutils import *
from pysam.samtools import *
import pysam.config
+
# export all the symbols from separate modules
__all__ = \
libchtslib.__all__ +\
from cpython.version cimport PY_MAJOR_VERSION
from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
from libc.string cimport strchr
+from cpython cimport array as c_array
from pysam.cutils cimport force_bytes, force_str, \
charptr_to_str, charptr_to_bytes
# translation tables
# cigar code to character and vice versa
-cdef char* CODE2CIGAR= "MIDNSHP=X"
+cdef char* CODE2CIGAR= "MIDNSHP=XB"
+cdef int NCIGAR_CODES = 10
if PY_MAJOR_VERSION >= 3:
CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
else:
CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
-CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=X])")
+CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
#####################################################################
# typecode guessing
# map type from htslib to python array
cdef char * f = strchr(htslib_types, s)
+
if f == NULL:
- raise ValueError("unknown htslib tag typecode '%s'" % chr(s))
+ return 0
return parray_types[f - htslib_types]
cdef inline uint8_t map_typecode_python_to_htslib(char s):
"""determine value type from type code of array"""
cdef char * f = strchr(parray_types, s)
if f == NULL:
- raise ValueError(
- "unknown conversion for array typecode '%s'" % s)
+ return 0
return htslib_types[f - parray_types]
# optional tag data manipulation
"""
fmts, args = ["<"], []
+ cdef char array_typecode
+
datatype2format = {
b'c': ('b', 1),
b'C': ('B', 1),
elif isinstance(value, array.array):
# binary tags from arrays
if valuetype is None:
- valuetype = force_bytes(chr(
- map_typecode_python_to_htslib(ord(value.typecode))))
+ array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
+
+ if array_typecode == 0:
+ raise ValueError("unsupported type code '{}'"
+ .format(value.typecode))
+ valuetype = force_bytes(chr(array_typecode))
+
if valuetype not in datatype2format:
raise ValueError("invalid value type '%s' (%s)" %
(valuetype, type(valuetype)))
with the cigar string to reconstitute the query or the reference
sequence.
+ Positions corresponding to `N` (skipped region from the reference)
+ in the CIGAR string will not appear in the returned sequence. The
+ MD should correspondingly not contain these. Thus proper tags are::
+
+ Deletion from the reference: cigar=5M1D5M MD=5^C5
+ Skipped region from reference: cigar=5M1N5M MD=10
+
Returns
-------
s[s_idx] = read_sequence[r_idx]
r_idx += 1
s_idx += 1
- elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ elif op == BAM_CDEL:
for i from 0 <= i < l:
s[s_idx] = '-'
s_idx += 1
+ elif op == BAM_CREF_SKIP:
+ pass
elif op == BAM_CINS:
for i from 0 <= i < l:
# encode insertions into reference as lowercase
for i from 0 <= i < l:
result.append(ref_seq[r_idx])
r_idx += 1
- elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ elif op == BAM_CDEL:
for i from 0 <= i < l:
result.append(ref_seq[r_idx])
r_idx += 1
+ elif op == BAM_CREF_SKIP:
+ pass
elif op == BAM_CINS:
r_idx += l
elif op == BAM_CSOFT_CLIP:
return "".join(result)
-
def get_aligned_pairs(self, matches_only=False, with_seq=False):
"""a list of aligned read (query) and reference positions.
else:
qpos += l
- elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ elif op == BAM_CDEL:
if not _matches_only:
if _with_seq:
for i from pos <= i < pos + l:
elif op == BAM_CHARD_CLIP:
pass # advances neither
+ elif op == BAM_CREF_SKIP:
+ if not _matches_only:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((None, i, None))
+ else:
+ for i from pos <= i < pos + l:
+ result.append((None, i))
+
+ pos += l
+
elif op == BAM_CPAD:
raise NotImplementedError(
"Padding (BAM_CPAD, 6) is currently not supported. "
return overlap
+ def get_cigar_stats(self):
+ """summary of operations in cigar string.
+
+ The output order in the array is "MIDNSHP=XB" followed by a
+ field for the NM tag. If the NM tag is not present, this
+ field will always be 0.
+
+ +-----+--------------+-----+
+ |M    |BAM_CMATCH    |0    |
+ +-----+--------------+-----+
+ |I    |BAM_CINS      |1    |
+ +-----+--------------+-----+
+ |D    |BAM_CDEL      |2    |
+ +-----+--------------+-----+
+ |N    |BAM_CREF_SKIP |3    |
+ +-----+--------------+-----+
+ |S    |BAM_CSOFT_CLIP|4    |
+ +-----+--------------+-----+
+ |H    |BAM_CHARD_CLIP|5    |
+ +-----+--------------+-----+
+ |P    |BAM_CPAD      |6    |
+ +-----+--------------+-----+
+ |=    |BAM_CEQUAL    |7    |
+ +-----+--------------+-----+
+ |X    |BAM_CDIFF     |8    |
+ +-----+--------------+-----+
+ |B    |BAM_CBACK     |9    |
+ +-----+--------------+-----+
+ |NM   |NM tag        |10   |
+ +-----+--------------+-----+
+
+ If the read has no cigar operations, all operation counts are
+ zero. If the underlying cigar pointer is NULL, `None` is returned.
+
+ Parameters
+ ----------
+
+ Returns
+ -------
+
+ arrays : two arrays. The first contains the nucleotide counts within
+ each cigar operation, the second contains the number of blocks for
+ each cigar operation.
+
+ """
+
+ cdef int nfields = NCIGAR_CODES + 1
+
+ cdef c_array.array base_counts = array.array(
+ "I",
+ [0] * nfields)
+ cdef uint32_t [:] base_view = base_counts
+ cdef c_array.array block_counts = array.array(
+ "I",
+ [0] * nfields)
+ cdef uint32_t [:] block_view = block_counts
+
+ cdef bam1_t * src = self._delegate
+ cdef int op
+ cdef uint32_t l
+ cdef int32_t k
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+
+ if cigar_p == NULL:
+ return None
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ base_view[op] += l
+ block_view[op] += 1
+
+ cdef uint8_t * v = bam_aux_get(src, 'NM')
+ if v != NULL:
+ base_view[nfields - 1] = <int32_t>bam_aux2i(v)
+
+ return base_counts, block_counts
+
#####################################################
## Unsorted as yet
# TODO: capture in CIGAR object
cdef class AlignmentFile:
cdef object _filename
+ cdef object _reference_filename
# pointer to htsFile structure
cdef htsFile * htsfile
"UR", "SP"),
"RG" : ("ID", "SM", "LB", "DS",
"PU", "PI", "CN", "DT",
- "PL", "FO", "KS", "PG"),
+ "PL", "FO", "KS", "PG",
+ "PM"),
"PG" : ("PN", "ID", "VN", "CL",
"PP"),}
"""AlignmentFile(filepath_or_object, mode=None, template=None,
reference_names=None, reference_lengths=None, text=NULL,
header=None, add_sq_text=False, check_header=True, check_sq=True,
- filename=None)
+ reference_filename=None, filename=None)
A :term:`SAM`/:term:`BAM` formatted file.
4. The names (`reference_names`) and lengths
(`reference_lengths`) are supplied directly as lists.
- For writing a CRAM file, the filename of the reference can be
- added through a fasta formatted file (`reference_filename`)
+ When reading or writing a CRAM file, the filename of a FASTA-formatted
+ reference can be specified with `reference_filename`.
By default, if a file is opened in mode 'r', it is checked
for a valid header (`check_header` = True) and a definition of
when reading, check if SQ entries are present in header
(default=True)
+ reference_filename : string
+ Path to a FASTA-formatted reference file. Valid only for CRAM files.
+ When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
+ specified in the header (``UR`` tag), which are normally used to find
+ the reference.
+
filename : string
Alternative to filepath_or_object. Filename of the file
to be opened.
will be closed and a new file will be opened.
'''
cdef char *cfilename
+ cdef char *creference_filename
cdef char *cindexname
cdef char *cmode
cdef bytes bmode = mode.encode('ascii')
self._filename = filename = encode_filename(filename)
+ self._reference_filename = reference_filename = encode_filename(
+ reference_filename)
# FIXME: Use htsFormat when it is available
self.is_stream = filename == b"-"
# is given, the CRAM reference arrays will be built from
# the @SQ header in the header
if self.is_cram and reference_filename:
- # note that fn_aux takes ownership, so create
- # a copy
- fn = encode_filename(reference_filename)
- self.htsfile.fn_aux = strdup(fn)
+ # note that fn_aux takes ownership, so create a copy
+ self.htsfile.fn_aux = strdup(self._reference_filename)
# write header to htsfile
if self.is_bam or self.is_cram or "h" in mode:
"- is it SAM format?" % mode )
# self.header.ignore_sam_err = True
+ # set filename with reference sequences
+ if self.is_cram and reference_filename:
+ creference_filename = self._reference_filename
+ hts_set_opt(self.htsfile,
+ CRAM_OPT_REFERENCE,
+ creference_filename)
+
if check_sq and self.header.n_targets == 0:
raise ValueError(
("file has no sequences defined (mode='%s') - "
multiple_iterators : bool
- If `multiple_iterators` is True (default) multiple
+ If `multiple_iterators` is True, multiple
iterators on the same file can be used at the same time. The
iterator returned will receive its own copy of a filehandle to
the file effectively re-opening the file. Re-opening a file
def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
cdef char *cfilename
+ cdef char *creference_filename
if not samfile.is_open():
raise ValueError("I/O operation on closed file")
self.header = sam_hdr_read(self.htsfile)
assert self.header != NULL
self.owns_samfile = True
+ # options specific to CRAM files
+ if samfile.is_cram and samfile._reference_filename:
+ creference_filename = samfile._reference_filename
+ hts_set_opt(self.htsfile,
+ CRAM_OPT_REFERENCE,
+ creference_filename)
+
else:
self.htsfile = self.samfile.htsfile
self.owns_samfile = False
'missing {:d} requested samples'.format(
len(missing_samples)))
- keep_samples = force_bytes(b','.join(keep_samples))
+ keep_samples = force_bytes(','.join(keep_samples))
cdef char *keep = <char *>keep_samples if keep_samples else NULL
cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
cimport cython
from cpython cimport array
-from pysam.chtslib cimport faidx_t, gzFile, kstring_t
+from pysam.chtslib cimport faidx_t, kstring_t, BGZF
# These functions are put here and not in chtslib.pxd in order
# to avoid warnings for unused functions.
kstring_t seq
kstring_t qual
- gzFile gzopen(char *, char *)
- kseq_t *kseq_init(gzFile)
+ kseq_t *kseq_init(BGZF *)
int kseq_read(kseq_t *)
void kseq_destroy(kseq_t *)
- int gzclose(gzFile)
-
- kstream_t *ks_init(gzFile)
+ kstream_t *ks_init(BGZF *)
void ks_destroy(kstream_t *)
# Retrieve characters from stream until delimiter
cdef class FastxFile:
cdef object _filename
- cdef gzFile fastqfile
+ cdef BGZF * fastqfile
cdef kseq_t * entry
cdef bint persist
+ cdef bint is_remote
cdef kseq_t * getCurrent(self)
cdef int cnext(self)
from pysam.chtslib cimport \
faidx_nseq, fai_load, fai_destroy, fai_fetch, \
faidx_seq_len, \
- faidx_fetch_seq, gzopen, gzclose, hisremote
+ faidx_fetch_seq, hisremote, \
+ bgzf_open, bgzf_close
from pysam.cutils cimport force_bytes, force_str, charptr_to_str
from pysam.cutils cimport encode_filename, from_string_and_size
cdef char *cfilename = self._filename
self.is_remote = hisremote(cfilename)
+ if filepath_index is not None:
+ raise NotImplementedError(
+ "setting an explicit path for the index "
+ "is not implemented")
+
# open file for reading
if (self._filename != b"-"
and not self.is_remote
self.fastafile = NULL
def __dealloc__(self):
- self.close()
+ if self.fastafile != NULL:
+ fai_destroy(self.fastafile)
+ self.fastafile = NULL
# context manager interface
def __enter__(self):
on the file continues.
'''
- self.close()
+ if self.fastqfile != NULL:
+ self.close()
- if not os.path.exists(filename):
- raise IOError("no such file or directory: %s" % filename)
+ self._filename = encode_filename(filename)
+ cdef char *cfilename = self._filename
+ self.is_remote = hisremote(cfilename)
+
+ # open file for reading
+ if (self._filename != b"-"
+ and not self.is_remote
+ and not os.path.exists(filename)):
+ raise IOError("file `%s` not found" % filename)
self.persist = persist
- self._filename = encode_filename(filename)
- cdef char *cfilename = self._filename
with nogil:
- self.fastqfile = gzopen(cfilename, "r")
+ self.fastqfile = bgzf_open(cfilename, "r")
self.entry = kseq_init(self.fastqfile)
self._filename = filename
def close(self):
'''close the file.'''
+ if self.fastqfile != NULL:
+ bgzf_close(self.fastqfile)
+ self.fastqfile = NULL
if self.entry != NULL:
- gzclose(self.fastqfile)
- if self.entry:
- kseq_destroy(self.entry)
- self.entry = NULL
+ kseq_destroy(self.entry)
+ self.entry = NULL
def __dealloc__(self):
- self.close()
+ if self.fastqfile != NULL:
+ bgzf_close(self.fastqfile)
+ if self.entry:
+ kseq_destroy(self.entry)
# context manager interface
def __enter__(self):
FILE* PyFile_AsFile(object)
-cdef extern from "zlib.h" nogil:
- ctypedef void * gzFile
- ctypedef int64_t z_off_t
-
- int gzclose(gzFile fp)
- int gzread(gzFile fp, void *buf, unsigned int n)
- char *gzerror(gzFile fp, int *errnum)
-
- gzFile gzopen( char *path, char *mode)
- gzFile gzdopen (int fd, char *mode)
- char * gzgets(gzFile file, char *buf, int len)
- int gzeof(gzFile file)
-
-
cdef extern from "htslib/kstring.h" nogil:
ctypedef struct kstring_t:
size_t l, m
no_compression, gzip, bgzf, custom
compression_maximum
+ enum hts_fmt_option:
+ CRAM_OPT_DECODE_MD,
+ CRAM_OPT_PREFIX,
+ CRAM_OPT_VERBOSITY,
+ CRAM_OPT_SEQS_PER_SLICE,
+ CRAM_OPT_SLICES_PER_CONTAINER,
+ CRAM_OPT_RANGE,
+ CRAM_OPT_VERSION,
+ CRAM_OPT_EMBED_REF,
+ CRAM_OPT_IGNORE_MD5,
+ CRAM_OPT_REFERENCE,
+ CRAM_OPT_MULTI_SEQ_PER_SLICE,
+ CRAM_OPT_NO_REF,
+ CRAM_OPT_USE_BZIP2,
+ CRAM_OPT_SHARED_REF,
+ CRAM_OPT_NTHREADS,
+ CRAM_OPT_THREAD_POOL,
+ CRAM_OPT_USE_LZMA,
+ CRAM_OPT_USE_RANS,
+ CRAM_OPT_REQUIRED_FIELDS,
+ HTS_OPT_COMPRESSION_LEVEL,
+ HTS_OPT_NTHREADS,
+
ctypedef struct htsVersion:
short major, minor
# @param opt The CRAM_OPT_* option.
# @param ... Optional arguments, dependent on the option used.
# @return 0 for success, or negative if an error occurred.
- #int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
+ int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
char **hts_readlines(const char *fn, int *_n)
int close(int fd)
from pysam.chtslib cimport hts_idx_t, hts_itr_t, htsFile, \
- gzFile, tbx_t, kstring_t
+ tbx_t, kstring_t, BGZF
# These functions are put here and not in chtslib.pxd in order
# to avoid warnings for unused functions.
kstring_t seq
kstring_t qual
- gzFile gzopen(char *, char *)
- kseq_t *kseq_init(gzFile)
+ kseq_t *kseq_init(BGZF *)
int kseq_read(kseq_t *)
void kseq_destroy(kseq_t *)
- int gzclose(gzFile)
-
- kstream_t *ks_init(gzFile)
+ kstream_t *ks_init(BGZF *)
void ks_destroy(kstream_t *)
# Retrieve characters from stream until delimiter
cdef class tabix_file_iterator:
- cdef gzFile fh
+ cdef BGZF * fh
cdef kstream_t * kstream
cdef kstring_t buffer
cdef size_t size
cdef class GZIterator:
cdef object _filename
- cdef gzFile gzipfile
+ cdef BGZF * gzipfile
cdef kstream_t * kstream
cdef kstring_t buffer
cdef int __cnext__(self)
cimport pysam.ctabixproxies as ctabixproxies
from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
- BGZF, bgzf_open, bgzf_close, bgzf_write, gzFile, \
+ BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
- tbx_destroy, gzopen, gzclose, gzerror, gzdopen, hisremote
+ tbx_destroy, hisremote
from pysam.cutils cimport force_bytes, force_str, charptr_to_str
from pysam.cutils cimport encode_filename, from_string_and_size
cdef int x
result = []
for x from 0 <= x < nsequences:
- result.append(sequences[x])
+ result.append(force_str(sequences[x]))
# htslib instructions:
# only free container, not the sequences themselves
filename = encode_filename(filename)
cdef char *cfilename = filename
with nogil:
- self.gzipfile = gzopen(cfilename, "r")
+ self.gzipfile = bgzf_open(cfilename, "r")
self._filename = filename
self.kstream = ks_init(self.gzipfile)
self.encoding = encoding
def __dealloc__(self):
'''close file.'''
if self.gzipfile != NULL:
- gzclose(self.gzipfile)
+ bgzf_close(self.gzipfile)
self.gzipfile = NULL
if self.buffer.s != NULL:
free(self.buffer.s)
- ks_destroy(self.kstream)
+ if self.kstream != NULL:
+ ks_destroy(self.kstream)
def __iter__(self):
return self
#########################################################
## Iterators for parsing through unindexed files.
#########################################################
-cdef buildGzipError(void *gzfp):
- cdef int errnum = 0
- cdef char *s = gzerror(gzfp, &errnum)
- return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
+# cdef buildGzipError(void *gzfp):
+# cdef int errnum = 0
+# cdef char *s = gzerror(gzfp, &errnum)
+# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
cdef class tabix_file_iterator:
# in this case gzread will directly read from the file without decompression.
# When reading, this will be detected automatically by looking
# for the magic two-byte gzip header.
- self.fh = gzdopen(self.duplicated_fd, 'r')
+ self.fh = bgzf_dopen(self.duplicated_fd, 'r')
if self.fh == NULL:
raise IOError('%s' % strerror(errno))
# gzgets terminates at \n, no need to test
# parser creates a copy
- return self.parser.parse( b, self.buffer.l)
+ return self.parser.parse(b, self.buffer.l)
raise StopIteration
def __dealloc__(self):
free(self.buffer.s)
ks_destroy(self.kstream)
- gzclose(self.fh)
+ bgzf_close(self.fh)
def __next__(self):
return self.__cnext__()
from pysam.cutils cimport force_bytes, force_str, charptr_to_str
from pysam.cutils cimport encode_filename, from_string_and_size
+import collections
+
cdef char *StrOrEmpty(char * buffer):
if buffer == NULL:
return ""
elif op == 3: # != operator
return self.compare(other) != 0
else:
- return NotImplemented
+ err_msg = "op {0} isn't implemented yet".format(op)
+ raise NotImplementedError(err_msg)
cdef take(self, char * buffer, size_t nbytes):
'''start presenting buffer.
def __get__(self):
return self._getindex(1)
def __set__(self, value):
+ if value is None:
+ value = "."
self._setindex(1, value)
property feature:
def __get__(self):
return self._getindex(2)
def __set__(self, value):
+ if value is None:
+ value = "."
self._setindex(2, value)
property start:
return float(v)
def __set__(self, value):
- self._setindex(5, value)
+ if value is None:
+ value = "."
+ self._setindex(5, str(value))
property strand:
'''feature strand.'''
- def __get__(self ):
- return self._getindex(6)
+ def __get__(self):
+ return self._getindex(6)
def __set__(self, value ):
+ if value is None:
+ value = "."
self._setindex(6, value)
property frame:
'''feature frame.'''
def __get__(self):
- return self._getindex(7)
+ v = self._getindex(7)
+ if v == "" or v[0] == '.':
+ return v
+ else:
+ return int(v)
+
def __set__(self, value):
- self._setindex(7, value)
+ if value is None:
+ value = "."
+ self._setindex(7, str(value))
property attributes:
'''feature attributes (as a string).'''
def __get__(self):
if self.hasOwnAttributes:
- return self._attributes
+ return force_str(self._attributes)
else:
- return self._getindex(8)
+ return force_str(self._getindex(8))
def __set__( self, value):
if self.hasOwnAttributes:
free(self._attributes)
# Remove white space to prevent a last empty field.
fields = [x.strip() for x in attributes.strip().split("; ")]
- result = {}
+ result = collections.OrderedDict()
for f in fields:
else:
aa.append( '%s %s' % (k,str(v)) )
- a = "; ".join( aa ) + ";"
+ a = force_bytes("; ".join(aa) + ";")
p = a
l = len(a)
self._attributes = <char *>calloc(l + 1, sizeof(char))
str(self.start+1),
str(self.end),
toDot(self.score),
- self.strand,
- self.frame,
- self.attributes ) )
+ toDot(self.strand),
+ toDot(self.frame),
+ self.attributes))
else:
return TupleProxy.__str__(self)
r[name] = value
self.fromDict(r)
+ def __cmp__(self, other):
+     # Three-way comparison on the (contig, strand, start) sort key.
+     # __cmp__ has to return a negative, zero or positive integer;
+     # returning the boolean result of a single `<` can never signal
+     # "self is smaller than other".
+     self_key = (self.contig, self.strand, self.start)
+     other_key = (other.contig, other.strand, other.start)
+     return (self_key > other_key) - (self_key < other_key)
+
+ # python 3 compatibility
+ def __richcmp__(GTFProxy self, GTFProxy other, int op):
+     # Rich comparison. `op` uses the CPython comparison codes:
+     # 0: <, 1: <=, 2: ==, 3: !=, 4: >, 5: >=.
+     # Equality and inequality are delegated to self.compare(); the
+     # ordering operators compare the (contig, strand, start) key.
+     if op == 2:
+         return self.compare(other) == 0
+     elif op == 3:
+         return self.compare(other) != 0
+
+     self_key = (self.contig, self.strand, self.start)
+     other_key = (other.contig, other.strand, other.start)
+     if op == 0:
+         return self_key < other_key
+     elif op == 1:
+         return self_key <= other_key
+     elif op == 4:
+         # previously unsupported; mirrors the `<` case
+         return self_key > other_key
+     elif op == 5:
+         # previously unsupported; mirrors the `<=` case
+         return self_key >= other_key
+     else:
+         err_msg = "op {0} isn't implemented yet".format(op)
+         raise NotImplementedError(err_msg)
+
cdef class NamedTupleProxy(TupleProxy):
# do automatic conversion
self.contig = self.fields[0]
- self.start = atoi( self.fields[1] )
- self.end = atoi( self.fields[2] )
+ self.start = atoi(self.fields[1])
+ self.end = atoi(self.fields[2])
# __setattr__ in base class seems to take precedence
# hence implement setters in __setattr__
int bcftools_main(int argc, char *argv[])
void pysam_set_stderr(int fd)
void pysam_unset_stderr()
+ void pysam_set_stdout(int fd)
+ void pysam_set_stdout_fn(const char *)
+ void pysam_unset_stdout()
void set_optind(int)
from libc.string cimport strncpy
from libc.stdio cimport fprintf, stderr, fflush
from libc.stdio cimport stdout as c_stdout
+from posix.fcntl cimport open as c_open, O_WRONLY
#####################################################################
# hard-coded constants
return force_bytes(reference), rstart, rend
-@contextmanager
-def stdout_redirector(to=os.devnull):
- '''
- import os
-
- with stdout_redirected(to=filename):
- print("from Python")
- os.system("echo non-Python applications are also supported")
-
- see http://stackoverflow.com/questions/5081657/how-do-i-prevent-a-c-shared-library-to-print-on-stdout-in-python/17954769#17954769
- '''
- fd = sys.stdout.fileno()
-
- def _redirect_stdout(to):
- # flush C-level stdout
- try:
- fflush(c_stdout)
- sys.stdout.close()
- except (OSError, IOError):
- # some tools close stdout
- # Py3: OSError
- # Py2: IOError
- pass
-
- # fd writes to 'to' file
- os.dup2(to.fileno(), fd)
- # Python writes to fd
- if IS_PYTHON3:
- sys.stdout = io.TextIOWrapper(
- os.fdopen(fd, 'wb'))
- else:
- sys.stdout = os.fdopen(fd, 'w')
-
- with os.fdopen(os.dup(fd), 'w') as old_stdout:
- _redirect_stdout(to)
- try:
- yield # allow code to be run with the redirected stdout
- finally:
- _redirect_stdout(old_stdout)
- # restore stdout.
- # buffering and flags may be different
-
-# def stdout_redirector(stream):
-# """
-# See discussion in:
-
-# http://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/
-# """
-
-# # The original fd stdout points to. Usually 1 on POSIX systems.
-# original_stdout_fd = sys.stdout.fileno()
-# print ("original_fd=", original_stdout_fd)
-# def _redirect_stdout(to_fd):
-# """Redirect stdout to the given file descriptor."""
-# # Flush the C-level buffer stdout
-# fflush(c_stdout)
-# # Flush and close sys.stdout - also closes the file descriptor
-# # (fd)
-# sys.stdout.close()
-# # Make original_stdout_fd point to the same file as to_fd
-# os.dup2(to_fd, original_stdout_fd)
-# # Create a new sys.stdout that points to the redirected fd
-# if IS_PYTHON3:
-# sys.stdout = io.TextIOWrapper(
-# os.fdopen(original_stdout_fd, 'wb'))
-
-# # Save a copy of the original stdout fd in saved_stdout_fd
-# saved_stdout_fd = os.dup(original_stdout_fd)
-# try:
-# # Create a temporary file and redirect stdout to it
-# tfile = tempfile.TemporaryFile(mode='w+b')
-# _redirect_stdout(tfile.fileno())
-# # Yield to caller, then redirect stdout back to the saved fd
-# yield
-# _redirect_stdout(saved_stdout_fd)
-# # Copy contents of temporary file to the given stream
-# tfile.flush()
-# tfile.seek(0, io.SEEK_SET)
-# stream.write(tfile.read())
-# finally:
-# tfile.close()
-# os.close(saved_stdout_fd)
-
-
def _pysam_dispatch(collection,
method,
- args=(),
- catch_stdout=True):
+ args=None,
+ catch_stdout=True,
+ save_stdout=None):
'''call ``method`` in samtools/bcftools providing arguments in args.
- .. note::
- This method redirects stdout to capture it
- from samtools. If for some reason stdout disappears
- the reason might be in this method.
-
- .. note::
- This method captures stdout and stderr using temporary files,
- which are then read into memory in their entirety. This method
- is slow and might cause large memory overhead.
-
- Catching of stdout can be turned of by setting *catch_stdout* to
+ Catching of stdout can be turned off by setting *catch_stdout* to
False.
- See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
- on the topic of redirecting stderr/stdout.
-
'''
- # note that debugging this module can be a problem
- # as stdout/stderr will not appear on the terminal
- # some special cases
if method == "index":
if not os.path.exists(args[0]):
raise IOError("No such file or directory: '%s'" % args[0])
+
+ if args is None:
+ args = []
+ else:
+ args = list(args)
- # redirect stderr and stdout to file
+ # redirect stderr to file
stderr_h, stderr_f = tempfile.mkstemp()
pysam_set_stderr(stderr_h)
+ # redirect stdout to file
+ if save_stdout:
+ stdout_f = save_stdout
+ stdout_h = c_open(force_bytes(stdout_f),
+ O_WRONLY)
+ if stdout_h == -1:
+ raise OSError("error while opening {} for writing".format(stdout_f))
+
+ pysam_set_stdout_fn(force_bytes(stdout_f))
+ pysam_set_stdout(stdout_h)
+ elif catch_stdout:
+ stdout_h, stdout_f = tempfile.mkstemp()
+
+ MAP_STDOUT_OPTIONS = {
+ "samtools": {
+ "view": "-o {}",
+ "mpileup": "-o {}",
+ "depad": "-o {}",
+ "calmd": "", # uses pysam_stdout_fn
+ },
+ "bcftools": {}
+ }
+
+ stdout_option = None
+ if collection == "bcftools":
+ # in bcftools, most methods accept -o, the exceptions
+ # are below:
+ if method not in ("index", "roh", "stats"):
+ stdout_option = "-o {}"
+ elif method in MAP_STDOUT_OPTIONS[collection]:
+ stdout_option = MAP_STDOUT_OPTIONS[collection][method]
+
+ if stdout_option is not None:
+ os.close(stdout_h)
+ pysam_set_stdout_fn(force_bytes(stdout_f))
+ args.extend(stdout_option.format(stdout_f).split(" "))
+ else:
+ pysam_set_stdout(stdout_h)
+ else:
+ pysam_set_stdout_fn("-")
+
# setup the function call to samtools/bcftools main
cdef char ** cargs
cdef int i, n, retval, l
-
n = len(args)
method = force_bytes(method)
collection = force_bytes(collection)
set_optind(0)
# call samtools/bcftools
- if catch_stdout:
- with tempfile.TemporaryFile(mode='w+b') as tfile:
- with stdout_redirector(tfile):
- if collection == b"samtools":
- retval = samtools_main(n + 2, cargs)
- elif collection == b"bcftools":
- retval = bcftools_main(n + 2, cargs)
- tfile.flush()
- tfile.seek(0)
- # do not force str, as output might be binary,
- # for example BAM, VCF.gz, etc.
- out_stdout = tfile.read()
- else:
- if collection == b"samtools":
- retval = samtools_main(n + 2, cargs)
- elif collection == b"bcftools":
- retval = bcftools_main(n + 2, cargs)
- out_stdout = None
+ if collection == b"samtools":
+ retval = samtools_main(n + 2, cargs)
+ elif collection == b"bcftools":
+ retval = bcftools_main(n + 2, cargs)
for i from 0 <= i < n:
free(cargs[i + 2])
free(cargs)
# get error messages
+ def _collect(fn):
+ out = []
+ try:
+ with open(fn, "r") as inf:
+ out = inf.read()
+ except UnicodeDecodeError:
+ with open(fn, "rb") as inf:
+ # read binary output
+ out = inf.read()
+ finally:
+ os.remove(fn)
+ return out
+
pysam_unset_stderr()
- out_stderr = []
- try:
- with open(stderr_f, "r") as inf:
- out_stderr = inf.readlines()
- except UnicodeDecodeError:
- with open( stderr_f, "rb") as inf:
- # read binary output
- out_stderr = inf.read()
- finally:
- os.remove(stderr_f)
+ out_stderr = _collect(stderr_f)
+
+ if save_stdout:
+ pysam_unset_stdout()
+ out_stdout = None
+ elif catch_stdout:
+ pysam_unset_stdout()
+ out_stdout = _collect(stdout_f)
+ else:
+ out_stdout = None
return retval, out_stderr, out_stdout
def __init__(self, vcf):
self.vcf = vcf
self.encoding = vcf.encoding
+
# if len(data) != len(self.vcf._samples):
# self.vcf.error(str(data),
# self.BAD_NUMBER_OF_COLUMNS,
def error(self, line, error, opt=None):
'''raise error.'''
# pass to vcf file for error handling
- return self.vcf.error( line, error, opt )
+ return self.vcf.error(line, error, opt)
cdef update(self, char * buffer, size_t nbytes):
'''update internal data.
if leftalign: self._leftalign = leftalign
self._lines = lines
self.encoding = "ascii"
+ self.tabixfile = None
def error(self,line,error,opt=None):
if error in self._ignored_errors: return
self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
self._parse_header(self.tabixfile.header)
+ def __del__(self):
+ self.close()
+ self.tabixfile = None
+
+ def close(self):
+ if self.tabixfile:
+ self.tabixfile.close()
+ self.tabixfile = None
+
def fetch(self,
reference=None,
start=None,
// #######################################################
// fastq parsing
-KSEQ_INIT(gzFile, gzread)
+// KSEQ_INIT(gzFile, gzread)
+KSEQ_INIT(BGZF *, bgzf_read)
//KSTREAM_INIT( gzFile, gzread, 16384)
#include <ctype.h>
#include <assert.h>
#include <unistd.h>
+#include <stdio.h>
#include "bam.h"
#include "bam_endian.h"
#include "htslib/khash.h"
#include "htslib/knetfile.h"
#include "pysam_util.h"
-// Definition of pysamerr
-#include "stdio.h"
-FILE * pysamerr = NULL;
+
+FILE * pysam_stderr = NULL;
+FILE * pysam_stdout = NULL;
+const char * pysam_stdout_fn = NULL;
+int PYSAM_STDOUT_FILENO = STDOUT_FILENO;
+
FILE * pysam_set_stderr(int fd)
{
- if (pysamerr != NULL)
- fclose(pysamerr);
- pysamerr = fdopen(fd, "w");
- return pysamerr;
+ if (pysam_stderr != NULL)
+ fclose(pysam_stderr);
+ pysam_stderr = fdopen(fd, "w");
+ return pysam_stderr;
}
void pysam_unset_stderr(void)
{
- if (pysamerr != NULL)
- fclose(pysamerr);
- pysamerr = fopen("/dev/null", "w");
+ if (pysam_stderr != NULL)
+ fclose(pysam_stderr);
+ pysam_stderr = fopen("/dev/null", "w");
+}
+
+/* Point pysam_stdout at file descriptor fd.
+ *
+ * Any stream currently held in pysam_stdout is closed first, then fd is
+ * wrapped with fdopen(fd, "w"). fd is recorded in PYSAM_STDOUT_FILENO
+ * whether or not fdopen succeeds.
+ *
+ * Returns the new stream, or NULL if fdopen failed.
+ */
+FILE * pysam_set_stdout(int fd)
+{
+  if (pysam_stdout != NULL)
+    fclose(pysam_stdout);
+  pysam_stdout = fdopen(fd, "w");
+  if (pysam_stdout == NULL)
+    {
+      /* pysam_stderr starts out as NULL and stays NULL until
+         pysam_set_stderr() succeeds; fall back to the process stderr
+         rather than handing a NULL stream to fprintf. */
+      fprintf(pysam_stderr != NULL ? pysam_stderr : stderr,
+              "could not set stdout to fd %i", fd);
+    }
+  PYSAM_STDOUT_FILENO = fd;
+  return pysam_stdout;
+}
+
+/* Record fn as the output file name in pysam_stdout_fn.
+ *
+ * Only the pointer is stored; the string is not copied, so the caller
+ * must keep the buffer alive for as long as pysam_stdout_fn may be read.
+ */
+void pysam_set_stdout_fn(const char *fn)
+{
+ pysam_stdout_fn = fn;
+}
+
+/* Close the current pysam_stdout stream, if any, re-open pysam_stdout
+ * on /dev/null and reset PYSAM_STDOUT_FILENO to STDOUT_FILENO.
+ * The result of fopen is not checked here.
+ */
+void pysam_unset_stdout(void)
+{
+ if (pysam_stdout != NULL)
+ fclose(pysam_stdout);
+ pysam_stdout = fopen("/dev/null", "w");
+ PYSAM_STDOUT_FILENO = STDOUT_FILENO;
}
void set_optind(int val)
#ifndef PYSAM_UTIL_H
#define PYSAM_UTIL_H
-//////////////////////////////////////////////////////////////////
/*! set pysam standard error to point to file descriptor
Setting the stderr will close the previous stderr.
*/
FILE * pysam_set_stderr(int fd);
-//////////////////////////////////////////////////////////////////
+/*! set pysam standard output to point to file descriptor
+
+ Setting the stdout will close the previous stdout.
+ */
+FILE * pysam_set_stdout(int fd);
+
+/*! set pysam standard output to point to filename
+
+ */
+void pysam_set_stdout_fn(const char * fn);
+
/*! set pysam standard error to /dev/null.
Unsetting the stderr will close the previous stderr.
*/
void pysam_unset_stderr(void);
+/*! set pysam standard output to /dev/null.
+
+ Unsetting the stdout will close the previous stdout.
+ */
+void pysam_unset_stdout(void);
+
int pysam_dispatch(int argc, char *argv[]);
void set_optind(int);
-// Definition of pysamerr
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
-FILE * pysamerr = NULL;
#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700)
/*
'''execute a samtools command.
Keyword arguments:
- catch_stdout -- redirect stdout from the samtools command and return as variable (default True)
+ catch_stdout -- redirect stdout from the samtools command and
+ return as variable (default True)
+ save_stdout -- redirect stdout to a filename.
raw -- ignore any parsers associated with this samtools command.
+ split_lines -- return stdout (if catch_stdout is True) and stderr
+ as a list of strings.
'''
retval, stderr, stdout = _pysam_dispatch(
self.collection,
self.dispatch,
args,
- catch_stdout=kwargs.get("catch_stdout", True))
+ catch_stdout=kwargs.get("catch_stdout", True),
+ save_stdout=kwargs.get("save_stdout", None))
+
+ if kwargs.get("split_lines", False):
+ stdout = stdout.splitlines()
+ if stderr:
+ stderr = stderr.splitlines()
if retval:
raise SamtoolsError(
"stdout=%s, stderr=%s" %
(self.collection,
retval,
- "\n".join(stdout),
- "\n".join(stderr)))
+ stdout,
+ stderr))
self.stderr = stderr
'''return the samtools usage information for this command'''
retval, stderr, stdout = csamtools._samtools_dispatch(
self.dispatch)
- return "".join(stderr)
+ return stderr
# pysam versioning information
-__version__ = "0.9.0"
+__version__ = "0.9.1"
-__samtools_version__ = "1.3"
+__samtools_version__ = "1.3.1"
-__htslib_version__ = "1.3"
+__htslib_version__ = "1.3.1"
# install htslib
cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/htslib/releases/download/1.3/htslib-1.3.tar.bz2 > htslib-1.3.tar.bz2
-tar xjvf htslib-1.3.tar.bz2
-cd htslib-1.3
+curl -L https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 > htslib-1.3.1.tar.bz2
+tar xjvf htslib-1.3.1.tar.bz2
+cd htslib-1.3.1
make
-PATH=$PATH:$WORKDIR/external-tools/htslib-1.3
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3
+PATH=$PATH:$WORKDIR/external-tools/htslib-1.3.1
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3.1
# install samtools, compile against htslib
cd $WORKDIR/external-tools
-curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3/samtools-1.3.tar.bz2 > samtools-1.3.tar.bz2
-tar xjvf samtools-1.3.tar.bz2
-cd samtools-1.3
-./configure --with-htslib=../htslib-1.3
+curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3.1/samtools-1.3.1.tar.bz2 > samtools-1.3.1.tar.bz2
+tar xjvf samtools-1.3.1.tar.bz2
+cd samtools-1.3.1
+./configure --with-htslib=../htslib-1.3.1
make
-PATH=$PATH:$WORKDIR/external-tools/samtools-1.3
+PATH=$PATH:$WORKDIR/external-tools/samtools-1.3.1
echo "installed samtools"
samtools --version
# install bcftools
cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/bcftools/releases/download/1.3/bcftools-1.3.tar.bz2 > bcftools-1.3.tar.bz2
-tar xjf bcftools-1.3.tar.bz2
-cd bcftools-1.3
-./configure --with-htslib=../htslib-1.3
+curl -L https://github.com/samtools/bcftools/releases/download/1.3.1/bcftools-1.3.1.tar.bz2 > bcftools-1.3.1.tar.bz2
+tar xjf bcftools-1.3.1.tar.bz2
+cd bcftools-1.3.1
+./configure --with-htslib=../htslib-1.3.1
make
-PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3
+PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3.1
echo "installed bcftools"
bcftools --version
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <ctype.h>
#include <errno.h>
{
kstring_t str;
str.l = str.m = 0; str.s = NULL;
- sam_format1(header, b, &str);
+ if (sam_format1(header, b, &str) < 0) {
+ free(str.s);
+ str.s = NULL;
+ return NULL;
+ }
return str.s;
}
-void bam_view1(const bam_header_t *header, const bam1_t *b)
+int bam_view1(const bam_header_t *header, const bam1_t *b)
{
char *s = bam_format1(header, b);
- puts(s);
+ int ret = -1;
+ if (!s) return -1;
+ if (puts(s) != EOF) ret = 0;
free(s);
+ return ret;
}
int bam_validate1(const bam_header_t *header, const bam1_t *b)
last = *cp++;
}
+ if (!ID || !LB)
+ continue;
+
// Check it's the correct ID
if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t')
continue;
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <ctype.h>
#include <errno.h>
{
kstring_t str;
str.l = str.m = 0; str.s = NULL;
- sam_format1(header, b, &str);
+ if (sam_format1(header, b, &str) < 0) {
+ free(str.s);
+ str.s = NULL;
+ return NULL;
+ }
return str.s;
}
-void bam_view1(const bam_header_t *header, const bam1_t *b)
+/* Format record b as SAM text and write it, followed by a newline, to
+ * pysam_stdout. Returns 0 on success, -1 if formatting or either write
+ * fails.
+ */
+int bam_view1(const bam_header_t *header, const bam1_t *b)
{
char *s = bam_format1(header, b);
- puts(s);
+ int ret = -1;
+ if (!s) return -1;
+ // Test each write against EOF separately: `a & b != EOF` parses as
+ // `a & (b != EOF)`, which depends on the low bit of fputs()'s
+ // (merely non-negative) success value.
+ if (fputs(s, pysam_stdout) != EOF && fputc('\n', pysam_stdout) != EOF) ret = 0;
free(s);
+ return ret;
}
int bam_validate1(const bam_header_t *header, const bam1_t *b)
last = *cp++;
}
+ if (!ID || !LB)
+ continue;
+
// Check it's the correct ID
if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t')
continue;
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "1.3"
+#define BAM_VERSION "1.3.1"
#include <stdint.h>
#include <stdlib.h>
*/
char *bam_format1(const bam_header_t *header, const bam1_t *b);
- /*! @abstract Formats a BAM record and writes it and \n to stdout */
- void bam_view1(const bam_header_t *header, const bam1_t *b);
+ /*!
+ @abstract Formats a BAM record and writes it and \n to stdout
+ @return 0 if successful, -1 on error
+ */
+ int bam_view1(const bam_header_t *header, const bam1_t *b);
/*!
@abstract Check whether a BAM record is plausibly valid
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include <stdint.h>
#include <assert.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include <stdint.h>
#include <assert.h>
if ( cig==BAM_CHARD_CLIP ) continue;
if ( cig==BAM_CPAD ) continue;
if ( cig==BAM_CREF_SKIP ) continue;
- fprintf(pysamerr,"todo: cigar %d\n", cig);
+ fprintf(pysam_stderr,"todo: cigar %d\n", cig);
assert(0);
}
*len = n_tot_bases;
double sum = 0;
const double log2 = log(2.0);
- // fprintf(pysamerr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
+ // fprintf(pysam_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
int i;
for (i=0; i<call->n; i++)
{
else
tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
sum += tmp;
- // fprintf(pysamerr,"oi=%d %e\n", oi,tmp);
+ // fprintf(pysam_stderr,"oi=%d %e\n", oi,tmp);
}
call->seg_bias = sum;
}
}
}
-// if (ref_base < 0) fprintf(pysamerr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
+// if (ref_base < 0) fprintf(pysam_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
call->shift = (int)(sum_min + .499);
}
// combine annotations
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <assert.h>
#include <ctype.h>
#include <string.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <assert.h>
#include <ctype.h>
#include <string.h>
free(aux);
// TODO revisit how/whether to control printing this warning
if (hts_verbose >= 2)
- fprintf(pysamerr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
+ fprintf(pysam_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
return -1;
}
types = (int*)calloc(n_types, sizeof(int));
if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
if (max_i >= 0) r[max_i] = 15;
if (max2_i >= 0) r[max2_i] = 15;
- //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysamerr); fputc('\n', pysamerr);
+ //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysam_stderr); fputc('\n', pysam_stderr);
}
free(ref0); free(cns);
}
else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
else ir = est_indelreg(pos, ref, -types[t], 0);
if (ir > bca->indelreg) bca->indelreg = ir;
-// fprintf(pysamerr, "%d, %d, %d\n", pos, types[t], ir);
+// fprintf(pysam_stderr, "%d, %d, %d\n", pos, types[t], ir);
// realignment
for (s = K = 0; s < n; ++s) {
// write ref2
}
/*
for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
- fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysamerr);
- fputc('\n', pysamerr);
- for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysamerr);
- fputc('\n', pysamerr);
- fprintf(pysamerr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
+ fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysam_stderr);
+ fputc('\n', pysam_stderr);
+ for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysam_stderr);
+ fputc('\n', pysam_stderr);
+ fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
*/
}
}
if (seqQ > 255) seqQ = 255;
p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
-// fprintf(pysamerr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
+// fprintf(pysam_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
}
}
// determine bca->indel_types[] and bca->inscns
if (x == bca->indel_types[j]) break;
p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
if ((p->aux>>16&0x3f) > 0) ++n_alt;
- //fprintf(pysamerr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+ //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
}
}
}
* gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz
*/
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
* gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz
*/
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
int read_file_list(const char *file_list,int *n,char **argv[]);
static int usage() {
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -a output all positions (including zero depth)\n");
- fprintf(pysamerr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
- fprintf(pysamerr, " -b <bed> list of positions or regions\n");
- fprintf(pysamerr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(pysamerr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
- fprintf(pysamerr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
- fprintf(pysamerr, " -q <int> base quality threshold\n");
- fprintf(pysamerr, " -Q <int> mapping quality threshold\n");
- fprintf(pysamerr, " -r <chr:from-to> region\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -a output all positions (including zero depth)\n");
+ fprintf(pysam_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
+ fprintf(pysam_stderr, " -b <bed> list of positions or regions\n");
+ fprintf(pysam_stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
+ fprintf(pysam_stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
+ fprintf(pysam_stderr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
+ fprintf(pysam_stderr, " -q <int> base quality threshold\n");
+ fprintf(pysam_stderr, " -Q <int> mapping quality threshold\n");
+ fprintf(pysam_stderr, " -r <chr:from-to> region\n");
- sam_global_opt_help(pysamerr, "-.--.");
+ sam_global_opt_help(pysam_stderr, "-.--.");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "The output is a simple tab-separated table with three columns: reference name,\n");
- fprintf(pysamerr, "position, and coverage depth. Note that positions with zero coverage may be\n");
- fprintf(pysamerr, "omitted by default; see the -a option.\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
+ fprintf(pysam_stderr, "position, and coverage depth. Note that positions with zero coverage may be\n");
+ fprintf(pysam_stderr, "omitted by default; see the -a option.\n");
+ fprintf(pysam_stderr, "\n");
return 1;
}
rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
if (baseQ) rf |= SAM_QUAL;
if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
- fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
return 1;
}
if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
- fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
return 1;
}
data[i]->min_mapQ = mapQ; // set the mapQ filter
data[i]->min_len = min_len; // set the qlen filter
data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
if (data[i]->hdr == NULL) {
- fprintf(pysamerr, "Couldn't read header for \"%s\"\n",
+ fprintf(pysam_stderr, "Couldn't read header for \"%s\"\n",
argv[optind+i]);
status = EXIT_FAILURE;
goto depth_end;
while (++last_pos < h->target_len[last_tid]) {
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
continue;
- fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
+ fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1);
for (i = 0; i < n; i++)
- putchar('\t'), putchar('0');
- putchar('\n');
+ fputc('\t', pysam_stdout), fputc('0', pysam_stdout);
+ fputc('\n', pysam_stdout);
}
}
last_tid++;
if (last_pos < beg) continue; // out of range; skip
if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
continue;
- fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1);
+ fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1);
for (i = 0; i < n; i++)
- putchar('\t'), putchar('0');
- putchar('\n');
+ fputc('\t', pysam_stdout), fputc('0', pysam_stdout);
+ fputc('\n', pysam_stdout);
}
last_tid = tid;
last_pos = pos;
}
- fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
+ fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", pos+1); // a customized fprintf(pysam_stdout, ) would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
for (j = 0; j < n_plp[i]; ++j) {
if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
}
- printf("\t%d", n_plp[i] - m); // this the depth to output
+ fprintf(pysam_stdout, "\t%d", n_plp[i] - m); // this the depth to output
}
- putchar('\n');
+ fputc('\n', pysam_stdout);
}
if (ret < 0) status = EXIT_FAILURE;
free(n_plp); free(plp);
if (last_pos >= end) break;
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
continue;
- fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
+ fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1);
for (i = 0; i < n; i++)
- putchar('\t'), putchar('0');
- putchar('\n');
+ fputc('\t', pysam_stdout), fputc('0', pysam_stdout);
+ fputc('\n', pysam_stdout);
}
last_tid++;
last_pos = -1;
}
#ifdef _MAIN_BAM2DEPTH
-int main(int argc, char *argv[])
+int samtools_bam2depth_main(int argc, char *argv[])
{
return main_depth(argc, argv);
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include "samtools.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include "samtools.h"
if (*in == '\\') {
++in;
if (*in == '\0') {
- fprintf(pysamerr, "[%s] Unterminated escape sequence.\n", __func__);
+ fprintf(pysam_stderr, "[%s] Unterminated escape sequence.\n", __func__);
free(out);
return NULL;
}
*ptr = '\t';
break;
case 'n':
- fprintf(pysamerr, "[%s] \\n in escape sequence is not supported.\n", __func__);
+ fprintf(pysam_stderr, "[%s] \\n in escape sequence is not supported.\n", __func__);
free(out);
return NULL;
default:
- fprintf(pysamerr, "[%s] Unsupported escape sequence.\n", __func__);
+ fprintf(pysam_stderr, "[%s] Unsupported escape sequence.\n", __func__);
free(out);
return NULL;
}
"\n"
"Options:\n"
" -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
- " -o FILE Where to write output to [stdout]\n"
+ " -o FILE Where to write output to [pysam_stdout]\n"
" -r STRING @RG line text\n"
" -R STRING ID of @RG line in existing header to use\n"
);
*opts = NULL;
int n;
- if (argc == 1) { usage(stdout); return true; }
+ if (argc == 1) { usage(pysam_stdout); return true; }
parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
if (! retval ) {
- fprintf(pysamerr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
+ fprintf(pysam_stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
return false;
}
// Set defaults
} else if (strcmp(optarg, "orphan_only") == 0) {
retval->mode = orphan_only;
} else {
- usage(pysamerr);
+ usage(pysam_stderr);
return false;
}
break;
retval->output_name = strdup(optarg);
break;
case 'h':
- usage(stdout);
+ usage(pysam_stdout);
free(retval);
return true;
case '?':
- usage(pysamerr);
+ usage(pysam_stderr);
free(retval);
return false;
case 'O':
default:
if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
- usage(pysamerr);
+ usage(pysam_stderr);
free(retval);
return false;
}
retval->rg_line = ks_release(&rg_line);
if (argc-optind < 1) {
- fprintf(pysamerr, "You must specify an input file.\n");
- usage(pysamerr);
+ fprintf(pysam_stderr, "You must specify an input file.\n");
+ usage(pysam_stderr);
cleanup_opts(retval);
return false;
}
if (retval->rg_id && retval->rg_line) {
- fprintf(pysamerr, "The options -r and -R are mutually exclusive.\n");
+ fprintf(pysam_stderr, "The options -r and -R are mutually exclusive.\n");
cleanup_opts(retval);
return false;
}
char* tmp = basic_unescape(retval->rg_line);
if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
- fprintf(pysamerr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
+ fprintf(pysam_stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
free(tmp);
cleanup_opts(retval);
return false;
static bool init(const parsed_opts_t* opts, state_t** state_out) {
state_t* retval = (state_t*) calloc(1, sizeof(state_t));
if (retval == NULL) {
- fprintf(pysamerr, "[init] Out of memory allocating state struct.\n");
+ fprintf(pysam_stderr, "[init] Out of memory allocating state struct.\n");
return false;
}
*state_out = retval;
// Open files
retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
if (retval->input_file == NULL) {
- fprintf(pysamerr, "[init] Could not open input file: %s\n", opts->input_name);
+ fprintf(pysam_stderr, "[init] Could not open input file: %s\n", opts->input_name);
return false;
}
retval->input_header = sam_hdr_read(retval->input_file);
// Append new RG line to header.
// Check does not already exist
if ( confirm_rg(retval->output_header, opts->rg_id) ) {
- fprintf(pysamerr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
+ fprintf(pysam_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
return false;
}
retval->rg_id = strdup(opts->rg_id);
size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2;
char* new_header = malloc(new_len);
if (!new_header) {
- fprintf(pysamerr, "[init] Out of memory whilst writing new header.\n");
+ fprintf(pysam_stderr, "[init] Out of memory whilst writing new header.\n");
return false;
}
sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line);
if (opts->rg_id) {
// Confirm what has been supplied exists
if ( !confirm_rg(retval->output_header, opts->rg_id) ) {
- fprintf(pysamerr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
+ fprintf(pysam_stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
return false;
}
retval->rg_id = strdup(opts->rg_id);
} else {
if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) {
- fprintf(pysamerr, "No RG specified on command line or in existing header.\n");
+ fprintf(pysam_stderr, "No RG specified on command line or in existing header.\n");
return false;
}
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include <limits.h>
#include "bam.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include <limits.h>
#include "bam.h"
/* bam_cat.c -- efficiently concatenates bam files.
- Copyright (C) 2008-2009, 2011-2013 Genome Research Ltd.
+ Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd.
Modified SAMtools work copyright (C) 2010 Illumina, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
Illumina.
*/
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "htslib/sam.h"
#include "htslib/cram.h"
#include "htslib/khash.h"
+#include "samtools.h"
KHASH_MAP_INIT_STR(s2i, int)
in = sam_open(fn[i], "rc");
if (in == 0) {
- fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ print_error_errno("cat", "fail to open file '%s'", fn[i]);
return NULL;
}
in_c = in->fp.cram;
sprintf(vers, "%d.%d", vers_maj, vers_min);
out = sam_open(outcram, "wc");
if (out == 0) {
- fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram);
- return 1;
+ print_error_errno("cat", "fail to open output file '%s'", outcram);
+ return -1;
}
out_c = out->fp.cram;
cram_set_option(out_c, CRAM_OPT_VERSION, vers);
//fprintf(stderr, "Creating cram vers %s\n", vers);
cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed?
- sam_hdr_write(out, new_h);
+ if (sam_hdr_write(out, new_h) < 0) {
+ print_error_errno("cat", "Couldn't write header");
+ return -1;
+ }
for (i = 0; i < nfn; ++i) {
samFile *in;
in = sam_open(fn[i], "rc");
if (in == 0) {
- fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ print_error_errno("cat", "fail to open file '%s'", fn[i]);
return -1;
}
in_c = in->fp.cram;
int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam)
{
- BGZF *fp;
- uint8_t *buf;
+ BGZF *fp, *in = NULL;
+ uint8_t *buf = NULL;
uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
const int es=BGZF_EMPTY_BLOCK_SIZE;
int i;
fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
if (fp == 0) {
- fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
- return 1;
+ print_error_errno("cat", "fail to open output file '%s'", outbam);
+ return -1;
+ }
+ if (h) {
+ if (bam_hdr_write(fp, h) < 0) {
+ print_error_errno("cat", "Couldn't write header");
+ goto fail;
+ }
}
- if (h) bam_hdr_write(fp, h);
buf = (uint8_t*) malloc(BUF_SIZE);
+ if (!buf) {
+ fprintf(stderr, "[%s] Couldn't allocate buffer\n", __func__);
+ goto fail;
+ }
for(i = 0; i < nfn; ++i){
- BGZF *in;
bam_hdr_t *old;
int len,j;
in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
if (in == 0) {
- fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
- return -1;
+ print_error_errno("cat", "fail to open file '%s'", fn[i]);
+ goto fail;
}
if (in->is_write) return -1;
if (old == NULL) {
fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
__func__, fn[i]);
- bgzf_close(in);
- return -1;
+ goto fail;
+ }
+ if (h == 0 && i == 0) {
+ if (bam_hdr_write(fp, old) < 0) {
+ print_error_errno("cat", "Couldn't write header");
+ goto fail;
+ }
}
- if (h == 0 && i == 0) bam_hdr_write(fp, old);
if (in->block_offset < in->block_length) {
- bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
- bgzf_flush(fp);
+ if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_flush(fp) != 0) goto write_fail;
}
j=0;
int diff=es-len;
if(j==0) {
fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
- return -1;
+ goto fail;
}
- bgzf_raw_write(fp, ebuf, len);
+ if (bgzf_raw_write(fp, ebuf, len) < 0) goto write_fail;
+
memcpy(ebuf,ebuf+len,diff);
memcpy(ebuf+diff,buf,len);
} else {
- if(j!=0) bgzf_raw_write(fp, ebuf, es);
+ if(j!=0) {
+ if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
+ }
len-= es;
memcpy(ebuf,buf+len,es);
- bgzf_raw_write(fp, buf, len);
+ if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail;
}
j=1;
}
if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
fprintf(stderr, " Possible output corruption.\n");
- bgzf_raw_write(fp, ebuf, es);
+ if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
}
}
bam_hdr_destroy(old);
bgzf_close(in);
+ in = NULL;
}
free(buf);
- bgzf_close(fp);
+ if (bgzf_close(fp) < 0) {
+ fprintf(stderr, "[%s] Error on closing '%s'.\n", __func__, outbam);
+ return -1;
+ }
return 0;
+
+ write_fail:
+ fprintf(stderr, "[%s] Error writing to '%s'.\n", __func__, outbam);
+ fail:
+ if (in) bgzf_close(in);
+ if (fp) bgzf_close(fp);
+ free(buf);
+ return -1;
}
{
bam_hdr_t *h = 0;
char *outfn = 0;
- int c, ret;
+ int c, ret = 0;
samFile *in;
while ((c = getopt(argc, argv, "h:o:")) >= 0) {
in = sam_open(argv[optind], "r");
if (!in) {
- fprintf(stderr, "[%s] ERROR: failed to open file '%s'.\n", __func__, argv[optind]);
+ print_error_errno("cat", "failed to open file '%s'", argv[optind]);
return 1;
}
switch (hts_get_format(in)->format) {
case bam:
sam_close(in);
- ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ ret = 1;
break;
case cram:
sam_close(in);
- ret = cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ ret = 1;
break;
default:
/* bam_cat.c -- efficiently concatenates bam files.
- Copyright (C) 2008-2009, 2011-2013 Genome Research Ltd.
+ Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd.
Modified SAMtools work copyright (C) 2010 Illumina, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
Illumina.
*/
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "htslib/sam.h"
#include "htslib/cram.h"
#include "htslib/khash.h"
+#include "samtools.h"
KHASH_MAP_INIT_STR(s2i, int)
in = sam_open(fn[i], "rc");
if (in == 0) {
- fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ print_error_errno("cat", "fail to open file '%s'", fn[i]);
return NULL;
}
in_c = in->fp.cram;
int vmin = cram_minor_vers(in_c);
if ((vers_maj != -1 && vers_maj != vmaj) ||
(vers_min != -1 && vers_min != vmin)) {
- fprintf(pysamerr, "[%s] ERROR: input files have differing version numbers.\n",
+ fprintf(pysam_stderr, "[%s] ERROR: input files have differing version numbers.\n",
__func__);
return NULL;
}
int added;
new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added);
- //fprintf(pysamerr, "RG %s: #%d -> #%d\n",
+ //fprintf(pysam_stderr, "RG %s: #%d -> #%d\n",
// rg2id_in->id[ki], ki, new_rg);
if (added) {
}
if (new_rg != ki && rg2id_in->n_id > 1) {
- fprintf(pysamerr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
+ fprintf(pysam_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
__func__);
return NULL;
}
sprintf(vers, "%d.%d", vers_maj, vers_min);
out = sam_open(outcram, "wc");
if (out == 0) {
- fprintf(pysamerr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram);
- return 1;
+ print_error_errno("cat", "fail to open output file '%s'", outcram);
+ return -1;
}
out_c = out->fp.cram;
cram_set_option(out_c, CRAM_OPT_VERSION, vers);
- //fprintf(pysamerr, "Creating cram vers %s\n", vers);
+ //fprintf(pysam_stderr, "Creating cram vers %s\n", vers);
cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed?
- sam_hdr_write(out, new_h);
+ if (sam_hdr_write(out, new_h) < 0) {
+ print_error_errno("cat", "Couldn't write header");
+ return -1;
+ }
for (i = 0; i < nfn; ++i) {
samFile *in;
in = sam_open(fn[i], "rc");
if (in == 0) {
- fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+ print_error_errno("cat", "fail to open file '%s'", fn[i]);
return -1;
}
in_c = in->fp.cram;
// we need to edit the compression header. IF WE CAN.
if (new_rg) {
int zero = 0;
- //fprintf(pysamerr, "Transcode RG %d to %d\n", 0, new_rg);
+ //fprintf(pysam_stderr, "Transcode RG %d to %d\n", 0, new_rg);
cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
} else {
int32_t num_slices;
int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam)
{
- BGZF *fp;
- uint8_t *buf;
+ BGZF *fp, *in = NULL;
+ uint8_t *buf = NULL;
uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
const int es=BGZF_EMPTY_BLOCK_SIZE;
int i;
- fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
+ fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(pysam_stdout), "w");
if (fp == 0) {
- fprintf(pysamerr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
- return 1;
+ print_error_errno("cat", "fail to open output file '%s'", outbam);
+ return -1;
+ }
+ if (h) {
+ if (bam_hdr_write(fp, h) < 0) {
+ print_error_errno("cat", "Couldn't write header");
+ goto fail;
+ }
}
- if (h) bam_hdr_write(fp, h);
buf = (uint8_t*) malloc(BUF_SIZE);
+ if (!buf) {
+ fprintf(pysam_stderr, "[%s] Couldn't allocate buffer\n", __func__);
+ goto fail;
+ }
for(i = 0; i < nfn; ++i){
- BGZF *in;
bam_hdr_t *old;
int len,j;
in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
if (in == 0) {
- fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
- return -1;
+ print_error_errno("cat", "fail to open file '%s'", fn[i]);
+ goto fail;
}
if (in->is_write) return -1;
old = bam_hdr_read(in);
if (old == NULL) {
- fprintf(pysamerr, "[%s] ERROR: couldn't read header for '%s'.\n",
+ fprintf(pysam_stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
__func__, fn[i]);
- bgzf_close(in);
- return -1;
+ goto fail;
+ }
+ if (h == 0 && i == 0) {
+ if (bam_hdr_write(fp, old) < 0) {
+ print_error_errno("cat", "Couldn't write header");
+ goto fail;
+ }
}
- if (h == 0 && i == 0) bam_hdr_write(fp, old);
if (in->block_offset < in->block_length) {
- bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
- bgzf_flush(fp);
+ if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_flush(fp) != 0) goto write_fail;
}
j=0;
if(len<es){
int diff=es-len;
if(j==0) {
- fprintf(pysamerr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
- return -1;
+ fprintf(pysam_stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
+ goto fail;
}
- bgzf_raw_write(fp, ebuf, len);
+ if (bgzf_raw_write(fp, ebuf, len) < 0) goto write_fail;
+
memcpy(ebuf,ebuf+len,diff);
memcpy(ebuf+diff,buf,len);
} else {
- if(j!=0) bgzf_raw_write(fp, ebuf, es);
+ if(j!=0) {
+ if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
+ }
len-= es;
memcpy(ebuf,buf+len,es);
- bgzf_raw_write(fp, buf, len);
+ if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail;
}
j=1;
}
const uint8_t gzip2=ebuf[1];
const uint32_t isize=*((uint32_t*)(ebuf+es-4));
if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
- fprintf(pysamerr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
- fprintf(pysamerr, " Possible output corruption.\n");
- bgzf_raw_write(fp, ebuf, es);
+ fprintf(pysam_stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
+ fprintf(pysam_stderr, " Possible output corruption.\n");
+ if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
}
}
bam_hdr_destroy(old);
bgzf_close(in);
+ in = NULL;
}
free(buf);
- bgzf_close(fp);
+ if (bgzf_close(fp) < 0) {
+ fprintf(pysam_stderr, "[%s] Error on closing '%s'.\n", __func__, outbam);
+ return -1;
+ }
return 0;
+
+ write_fail:
+ fprintf(pysam_stderr, "[%s] Error writing to '%s'.\n", __func__, outbam);
+ fail:
+ if (in) bgzf_close(in);
+ if (fp) bgzf_close(fp);
+ free(buf);
+ return -1;
}
{
bam_hdr_t *h = 0;
char *outfn = 0;
- int c, ret;
+ int c, ret = 0;
samFile *in;
while ((c = getopt(argc, argv, "h:o:")) >= 0) {
case 'h': {
samFile *fph = sam_open(optarg, "r");
if (fph == 0) {
- fprintf(pysamerr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);
+ fprintf(pysam_stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);
return 1;
}
h = sam_hdr_read(fph);
if (h == NULL) {
- fprintf(pysamerr,
+ fprintf(pysam_stderr,
"[%s] ERROR: failed to read the header for '%s'.\n",
__func__, argv[1]);
return 1;
}
}
if (argc - optind < 1) {
- fprintf(pysamerr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
+ fprintf(pysam_stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
return 1;
}
in = sam_open(argv[optind], "r");
if (!in) {
- fprintf(pysamerr, "[%s] ERROR: failed to open file '%s'.\n", __func__, argv[optind]);
+ print_error_errno("cat", "failed to open file '%s'", argv[optind]);
return 1;
}
switch (hts_get_format(in)->format) {
case bam:
sam_close(in);
- ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ ret = 1;
break;
case cram:
sam_close(in);
- ret = cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+ if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ ret = 1;
break;
default:
sam_close(in);
- fprintf(pysamerr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
+ fprintf(pysam_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
return 1;
}
free(outfn);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include "bam.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include "bam.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
static void usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Convert between textual and numeric flag representation\n");
- fprintf(pysamerr, "Usage: samtools flags INT|STR[,...]\n");
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Flags:\n");
- fprintf(pysamerr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
- fprintf(pysamerr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR);
- fprintf(pysamerr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP);
- fprintf(pysamerr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP);
- fprintf(pysamerr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE);
- fprintf(pysamerr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
- fprintf(pysamerr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1);
- fprintf(pysamerr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2);
- fprintf(pysamerr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY);
- fprintf(pysamerr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL);
- fprintf(pysamerr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP);
- fprintf(pysamerr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Convert between textual and numeric flag representation\n");
+ fprintf(pysam_stderr, "Usage: samtools flags INT|STR[,...]\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Flags:\n");
+ fprintf(pysam_stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
+ fprintf(pysam_stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR);
+ fprintf(pysam_stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP);
+ fprintf(pysam_stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP);
+ fprintf(pysam_stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE);
+ fprintf(pysam_stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
+ fprintf(pysam_stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1);
+ fprintf(pysam_stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2);
+ fprintf(pysam_stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY);
+ fprintf(pysam_stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL);
+ fprintf(pysam_stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP);
+ fprintf(pysam_stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
+ fprintf(pysam_stderr, "\n");
}
else
{
int mask = bam_str2flag(argv[1]);
- if ( mask<0 ) { fprintf(pysamerr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; }
- printf("0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask));
+ if ( mask<0 ) { fprintf(pysam_stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; }
+ fprintf(pysam_stdout, "0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask));
}
return 0;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <zlib.h>
#include <stdio.h>
#include <string.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <zlib.h>
#include <stdio.h>
#include <string.h>
free(str->s); free(str);
header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : "");
free(samstr.s);
- fprintf(pysamerr, "[sam_header_read2] %d sequences loaded.\n", n_targets);
+ fprintf(pysam_stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets);
return header;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/khash.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/khash.h>
case 'c': csi = 1; break;
case 'm': csi = 1; min_shift = atoi(optarg); break;
default:
- index_usage(pysamerr);
+ index_usage(pysam_stderr);
return 1;
}
if (optind == argc) {
- index_usage(stdout);
+ index_usage(pysam_stdout);
return 1;
}
samFile* fp;
if (argc < 2) {
- fprintf(pysamerr, "Usage: samtools idxstats <in.bam>\n");
+ fprintf(pysam_stderr, "Usage: samtools idxstats <in.bam>\n");
return 1;
}
fp = sam_open(argv[1], "r");
- if (fp == NULL) { fprintf(pysamerr, "[%s] fail to open BAM.\n", __func__); return 1; }
+ if (fp == NULL) { fprintf(pysam_stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(pysamerr, "[%s] failed to read header for '%s'.\n",
+ fprintf(pysam_stderr, "[%s] failed to read header for '%s'.\n",
__func__, argv[1]);
return 1;
}
idx = sam_index_load(fp, argv[1]);
- if (idx == NULL) { fprintf(pysamerr, "[%s] fail to load the index.\n", __func__); return 1; }
+ if (idx == NULL) { fprintf(pysam_stderr, "[%s] fail to load the index.\n", __func__); return 1; }
int i;
for (i = 0; i < header->n_targets; ++i) {
// Print out contig name and length
- printf("%s\t%d", header->target_name[i], header->target_len[i]);
+ fprintf(pysam_stdout, "%s\t%d", header->target_name[i], header->target_len[i]);
// Now fetch info about it from the meta bin
uint64_t u, v;
hts_idx_get_stat(idx, i, &u, &v);
- printf("\t%" PRIu64 "\t%" PRIu64 "\n", u, v);
+ fprintf(pysam_stdout, "\t%" PRIu64 "\t%" PRIu64 "\n", u, v);
}
// Dump information about unmapped reads
- printf("*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx));
+ fprintf(pysam_stdout, "*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx));
bam_hdr_destroy(header);
hts_idx_destroy(idx);
sam_close(fp);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
}
tv->n_pre = l;
/*
- fprintf(pysamerr, "%d\t", pos+1);
+ fprintf(pysam_stderr, "%d\t", pos+1);
for (i = 0; i < n; ++i) {
const bam_pileup1_t *p = pl + i;
- if (p->is_head) fprintf(pysamerr, "^");
- if (p->is_tail) fprintf(pysamerr, "$");
- fprintf(pysamerr, "%d,", p->level);
+ if (p->is_head) fprintf(pysam_stderr, "^");
+ if (p->is_tail) fprintf(pysam_stderr, "$");
+ fprintf(pysam_stderr, "%d,", p->level);
}
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
*/
return 0;
}
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2014 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2016 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "samtools.h"
/*
* This function calculates ct tag for two bams, it assumes they are from the same template and
}
// currently, this function ONLY works if each read has one hit
-static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct)
+static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct)
{
bam_hdr_t *header;
- bam1_t *b[2];
+ bam1_t *b[2] = { NULL, NULL };
int curr, has_prev, pre_end = 0, cur_end = 0;
kstring_t str;
header = sam_hdr_read(in);
if (header == NULL) {
fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n");
- exit(1);
+ return 1;
}
// Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted.
if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) {
// (e.g. must ignore in a @CO comment line later in header)
if ((p != 0) && (p < q)) {
fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n");
- exit(1);
+ goto fail;
}
}
- sam_hdr_write(out, header);
+ if (sam_hdr_write(out, header) < 0) goto write_fail;
b[0] = bam_init1();
b[1] = bam_init1();
bam1_t *cur = b[curr], *pre = b[1-curr];
if (cur->core.flag & BAM_FSECONDARY)
{
- if ( !remove_reads ) sam_write1(out, header, cur);
+ if ( !remove_reads ) {
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
+ }
continue; // skip secondary alignments
}
if (cur->core.flag & BAM_FSUPPLEMENTARY)
{
- sam_write1(out, header, cur);
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
}
if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag
// Write out result
if ( !remove_reads ) {
- sam_write1(out, header, pre);
- sam_write1(out, header, cur);
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
} else {
// If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags
if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
- if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre);
- if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur);
+ if(!(pre->core.flag&BAM_FUNMAP)) {
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
+ }
+ if(!(cur->core.flag&BAM_FUNMAP)) {
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
+ }
}
has_prev = 0;
} else { // unpaired? clear bad info and write it out
}
pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
- if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre);
+ if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
+ }
}
} else has_prev = 1;
curr = 1 - curr;
pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
- sam_write1(out, header, pre);
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
}
bam_hdr_destroy(header);
bam_destroy1(b[0]);
bam_destroy1(b[1]);
free(str.s);
+ return 0;
+
+ write_fail:
+ print_error_errno("fixmate", "Couldn't write to output file");
+ fail:
+ bam_hdr_destroy(header);
+ bam_destroy1(b[0]);
+ bam_destroy1(b[1]);
+ return 1;
}
void usage(FILE* where)
int bam_mating(int argc, char *argv[])
{
- samFile *in, *out;
- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0;
+ samFile *in = NULL, *out = NULL;
+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
case 'c': add_ct = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
- case '?': usage(stderr); return 1;
+ case '?': usage(stderr); goto fail;
}
}
- if (optind+1 >= argc) { usage(stderr); return 1; }
+ if (optind+1 >= argc) { usage(stderr); goto fail; }
// init
if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
- fprintf(stderr, "[bam_mating] cannot open input file\n");
- return 1;
+ print_error_errno("fixmate", "cannot open input file");
+ goto fail;
}
sam_open_mode(wmode+1, argv[optind+1], NULL);
if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) {
- fprintf(stderr, "[bam_mating] cannot open output file\n");
- return 1;
+ print_error_errno("fixmate", "cannot open output file");
+ goto fail;
}
// run
- bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
// cleanup
- sam_close(in); sam_close(out);
+ sam_close(in);
+ if (sam_close(out) < 0) {
+ fprintf(stderr, "[bam_mating] error while closing output file\n");
+ res = 1;
+ }
+
sam_global_args_free(&ga);
+ return res;
- return 0;
+ fail:
+ if (in) sam_close(in);
+ if (out) sam_close(out);
+ sam_global_args_free(&ga);
+ return 1;
}
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2014 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2016 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "samtools.h"
/*
* This function calculates ct tag for two bams, it assumes they are from the same template and
}
// currently, this function ONLY works if each read has one hit
-static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct)
+static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct)
{
bam_hdr_t *header;
- bam1_t *b[2];
+ bam1_t *b[2] = { NULL, NULL };
int curr, has_prev, pre_end = 0, cur_end = 0;
kstring_t str;
str.l = str.m = 0; str.s = 0;
header = sam_hdr_read(in);
if (header == NULL) {
- fprintf(pysamerr, "[bam_mating_core] ERROR: Couldn't read header\n");
- exit(1);
+ fprintf(pysam_stderr, "[bam_mating_core] ERROR: Couldn't read header\n");
+ return 1;
}
// Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted.
if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) {
// Looking for SO:coordinate within the @HD line only
// (e.g. must ignore in a @CO comment line later in header)
if ((p != 0) && (p < q)) {
- fprintf(pysamerr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n");
- exit(1);
+ fprintf(pysam_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n");
+ goto fail;
}
}
- sam_hdr_write(out, header);
+ if (sam_hdr_write(out, header) < 0) goto write_fail;
b[0] = bam_init1();
b[1] = bam_init1();
bam1_t *cur = b[curr], *pre = b[1-curr];
if (cur->core.flag & BAM_FSECONDARY)
{
- if ( !remove_reads ) sam_write1(out, header, cur);
+ if ( !remove_reads ) {
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
+ }
continue; // skip secondary alignments
}
if (cur->core.flag & BAM_FSUPPLEMENTARY)
{
- sam_write1(out, header, cur);
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
}
if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag
// Write out result
if ( !remove_reads ) {
- sam_write1(out, header, pre);
- sam_write1(out, header, cur);
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
} else {
// If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags
if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
- if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre);
- if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur);
+ if(!(pre->core.flag&BAM_FUNMAP)) {
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
+ }
+ if(!(cur->core.flag&BAM_FUNMAP)) {
+ if (sam_write1(out, header, cur) < 0) goto write_fail;
+ }
}
has_prev = 0;
} else { // unpaired? clear bad info and write it out
}
pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
- if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre);
+ if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
+ }
}
} else has_prev = 1;
curr = 1 - curr;
pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
- sam_write1(out, header, pre);
+ if (sam_write1(out, header, pre) < 0) goto write_fail;
}
bam_hdr_destroy(header);
bam_destroy1(b[0]);
bam_destroy1(b[1]);
free(str.s);
+ return 0;
+
+ write_fail:
+ print_error_errno("fixmate", "Couldn't write to output file");
+ fail:
+ bam_hdr_destroy(header);
+ bam_destroy1(b[0]);
+ bam_destroy1(b[1]);
+ return 1;
}
void usage(FILE* where)
fprintf(where,
"\n"
-"As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n"
+"As elsewhere in samtools, use '-' as the filename for stdin/pysam_stdout. The input\n"
"file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n"
"input is not accepted.\n");
}
int bam_mating(int argc, char *argv[])
{
- samFile *in, *out;
- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0;
+ samFile *in = NULL, *out = NULL;
+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
};
// parse args
- if (argc == 1) { usage(stdout); return 0; }
+ if (argc == 1) { usage(pysam_stdout); return 0; }
while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'c': add_ct = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
- case '?': usage(pysamerr); return 1;
+ case '?': usage(pysam_stderr); goto fail;
}
}
- if (optind+1 >= argc) { usage(pysamerr); return 1; }
+ if (optind+1 >= argc) { usage(pysam_stderr); goto fail; }
// init
if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
- fprintf(pysamerr, "[bam_mating] cannot open input file\n");
- return 1;
+ print_error_errno("fixmate", "cannot open input file");
+ goto fail;
}
sam_open_mode(wmode+1, argv[optind+1], NULL);
if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) {
- fprintf(pysamerr, "[bam_mating] cannot open output file\n");
- return 1;
+ print_error_errno("fixmate", "cannot open output file");
+ goto fail;
}
// run
- bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
// cleanup
- sam_close(in); sam_close(out);
+ sam_close(in);
+ if (sam_close(out) < 0) {
+ fprintf(pysam_stderr, "[bam_mating] error while closing output file\n");
+ res = 1;
+ }
+
sam_global_args_free(&ga);
+ return res;
- return 0;
+ fail:
+ if (in) sam_close(in);
+ if (out) sam_close(out);
+ sam_global_args_free(&ga);
+ return 1;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include "htslib/kstring.h"
#include "kprobaln.h"
#include "sam_opts.h"
+#include "samtools.h"
#define USE_EQUAL 1
#define DROP_TAG 2
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
- samFile *fp, *fpout = 0;
- bam_hdr_t *header;
- faidx_t *fai;
- char *ref = 0, mode_w[8], *ref_file;
- bam1_t *b;
+ samFile *fp = NULL, *fpout = NULL;
+ bam_hdr_t *header = NULL;
+ faidx_t *fai = NULL;
+ char *ref = NULL, mode_w[8], *ref_file;
+ bam1_t *b = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
if (optind + (ga.reference == NULL) >= argc)
return calmd_usage();
fp = sam_open_format(argv[optind], "r", &ga.in);
- if (fp == 0) return 1;
+ if (fp == NULL) {
+ print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]);
+ return 1;
+ }
header = sam_hdr_read(fp);
if (header == NULL || header->n_targets == 0) {
fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
- return 1;
+ goto fail;
}
fpout = sam_open_format("-", mode_w, &ga.out);
- sam_hdr_write(fpout, header);
+ if (fpout == NULL) {
+ print_error_errno("calmd", "Failed to open output");
+ goto fail;
+ }
+ if (sam_hdr_write(fpout, header) < 0) {
+ print_error_errno("calmd", "Failed to write sam header");
+ goto fail;
+ }
ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
fai = fai_load(ref_file);
if (!fai) {
- perror(ref_file);
- return 1;
+ print_error_errno("calmd", "Failed to open reference file '%s'", ref_file);
+ goto fail;
}
b = bam_init1();
+ if (!b) {
+ fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n");
+ goto fail;
+ }
while ((ret = sam_read1(fp, header, b)) >= 0) {
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
tid = b->core.tid;
- if (ref == 0)
+ if (ref == 0) { // FIXME: Should this always be fatal?
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
header->target_name[tid]);
+ if (is_realn || capQ > 10) goto fail; // Would otherwise crash
+ }
}
if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
if (capQ > 10) {
}
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
}
- sam_write1(fpout, header, b);
+ if (sam_write1(fpout, header, b) < 0) {
+ print_error_errno("calmd", "failed to write to output file");
+ goto fail;
+ }
+ }
+ if (ret < -1) {
+ fprintf(stderr, "[bam_fillmd] Error reading input.\n");
+ goto fail;
}
bam_destroy1(b);
bam_hdr_destroy(header);
free(ref);
fai_destroy(fai);
sam_close(fp);
- sam_close(fpout);
+ if (sam_close(fpout) < 0) {
+ fprintf(stderr, "[bam_fillmd] error when closing output file\n");
+ return 1;
+ }
return 0;
+
+ fail:
+ free(ref);
+ if (b) bam_destroy1(b);
+ if (header) bam_hdr_destroy(header);
+ if (fai) fai_destroy(fai);
+ if (fp) sam_close(fp);
+ if (fpout) sam_close(fpout);
+ return 1;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include "htslib/kstring.h"
#include "kprobaln.h"
#include "sam_opts.h"
+#include "samtools.h"
#define USE_EQUAL 1
#define DROP_TAG 2
if (old_nm) old_nm_i = bam_aux2i(old_nm);
if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
else if (nm != old_nm_i) {
- fprintf(pysamerr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
+ fprintf(pysam_stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
bam_aux_del(b, old_nm);
bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
}
if (i < str->l) is_diff = 1;
} else is_diff = 1;
if (is_diff) {
- fprintf(pysamerr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
+ fprintf(pysam_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
bam_aux_del(b, old_md);
bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
}
if (t > thres) return -1;
if (t < 0) t = 0;
t = sqrt((thres - t) / thres) * thres;
-// fprintf(pysamerr, "%s %lf %d\n", bam_get_qname(b), t, q);
+// fprintf(pysam_stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
return (int)(t + .499);
}
}
int calmd_usage() {
- fprintf(pysamerr,
+ fprintf(pysam_stderr,
"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
"Options:\n"
" -e change identical bases to '='\n"
" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
" -E extended BAQ for better sensitivity but lower specificity\n");
- sam_global_opt_help(pysamerr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....");
return 1;
}
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
- samFile *fp, *fpout = 0;
- bam_hdr_t *header;
- faidx_t *fai;
- char *ref = 0, mode_w[8], *ref_file;
- bam1_t *b;
+ samFile *fp = NULL, *fpout = NULL;
+ bam_hdr_t *header = NULL;
+ faidx_t *fai = NULL;
+ char *ref = NULL, mode_w[8], *ref_file;
+ bam1_t *b = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
case 'A': baq_flag |= 1; break;
case 'E': baq_flag |= 2; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- fprintf(pysamerr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
+ fprintf(pysam_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
/* else fall-through */
case '?': return calmd_usage();
}
if (optind + (ga.reference == NULL) >= argc)
return calmd_usage();
fp = sam_open_format(argv[optind], "r", &ga.in);
- if (fp == 0) return 1;
+ if (fp == NULL) {
+ print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]);
+ return 1;
+ }
header = sam_hdr_read(fp);
if (header == NULL || header->n_targets == 0) {
- fprintf(pysamerr, "[bam_fillmd] input SAM does not have header. Abort!\n");
- return 1;
+ fprintf(pysam_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
+ goto fail;
+ }
+
+ fpout = sam_open_format(pysam_stdout_fn, mode_w, &ga.out);
+ if (fpout == NULL) {
+ print_error_errno("calmd", "Failed to open output");
+ goto fail;
+ }
+ if (sam_hdr_write(fpout, header) < 0) {
+ print_error_errno("calmd", "Failed to write sam header");
+ goto fail;
}
-
- fpout = sam_open_format("-", mode_w, &ga.out);
- sam_hdr_write(fpout, header);
ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
fai = fai_load(ref_file);
if (!fai) {
- perror(ref_file);
- return 1;
+ print_error_errno("calmd", "Failed to open reference file '%s'", ref_file);
+ goto fail;
}
b = bam_init1();
+ if (!b) {
+ fprintf(pysam_stderr, "[bam_fillmd] Failed to allocate bam struct\n");
+ goto fail;
+ }
while ((ret = sam_read1(fp, header, b)) >= 0) {
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
tid = b->core.tid;
- if (ref == 0)
- fprintf(pysamerr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
+ if (ref == 0) { // FIXME: Should this always be fatal?
+ fprintf(pysam_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
header->target_name[tid]);
+ if (is_realn || capQ > 10) goto fail; // Would otherwise crash
+ }
}
if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
if (capQ > 10) {
}
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
}
- sam_write1(fpout, header, b);
+ if (sam_write1(fpout, header, b) < 0) {
+ print_error_errno("calmd", "failed to write to output file");
+ goto fail;
+ }
+ }
+ if (ret < -1) {
+ fprintf(pysam_stderr, "[bam_fillmd] Error reading input.\n");
+ goto fail;
}
bam_destroy1(b);
bam_hdr_destroy(header);
free(ref);
fai_destroy(fai);
sam_close(fp);
- sam_close(fpout);
+ if (sam_close(fpout) < 0) {
+ fprintf(pysam_stderr, "[bam_fillmd] error when closing output file\n");
+ return 1;
+ }
return 0;
+
+ fail:
+ free(ref);
+ if (b) bam_destroy1(b);
+ if (header) bam_hdr_destroy(header);
+ if (fai) fai_destroy(fai);
+ if (fp) sam_close(fp);
+ if (fpout) sam_close(fpout);
+ return 1;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
" -b, --bam-list FILE list of input BAM filenames, one per line\n"
" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
-" -d, --max-depth INT max per-BAM depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
fprintf(fp,
" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
fprintf(fp,
" -I, --skip-indels do not perform indel calling\n"
-" -L, --max-idepth INT maximum per-sample depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
fprintf(fp,
" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
fprintf(fp,
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
if (ma->conf->fai && b->core.tid >= 0) {
has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
- fprintf(pysamerr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
+ fprintf(pysam_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
__func__, b->core.pos, ref_len, b->core.tid);
skip = 1;
continue;
if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
if (id < 0 || id >= m->n) {
assert(q); // otherwise a bug
- fprintf(pysamerr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
+ fprintf(pysam_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
exit(EXIT_FAILURE);
}
if (m->n_plp[id] == m->m_plp[id]) {
sm = bam_smpl_init();
if (n == 0) {
- fprintf(pysamerr,"[%s] no input file/data given\n", __func__);
+ fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__);
exit(EXIT_FAILURE);
}
data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in);
if ( !data[i]->fp )
{
- fprintf(pysamerr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
+ fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
exit(EXIT_FAILURE);
}
if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
- fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
exit(EXIT_FAILURE);
}
if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
- fprintf(pysamerr, "[%s] failed to process %s: %s\n",
+ fprintf(pysam_stderr, "[%s] failed to process %s: %s\n",
__func__, conf->fai_fname, strerror(errno));
exit(EXIT_FAILURE);
}
data[i]->ref = &mp_ref;
h_tmp = sam_hdr_read(data[i]->fp);
if ( !h_tmp ) {
- fprintf(pysamerr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
+ fprintf(pysam_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
exit(EXIT_FAILURE);
}
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
if (conf->reg) {
hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
if (idx == NULL) {
- fprintf(pysamerr, "[%s] fail to load index for %s\n", __func__, fn[i]);
+ fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
exit(EXIT_FAILURE);
}
if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) {
- fprintf(pysamerr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
+ fprintf(pysam_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
exit(EXIT_FAILURE);
}
if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
gplp.m_plp = calloc(sm->n, sizeof(int));
gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*));
- fprintf(pysamerr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
+ fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
// write the VCF header
if (conf->flag & MPLP_BCF)
{
bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
if (bcf_fp == NULL) {
- fprintf(pysamerr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
+ fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
exit(EXIT_FAILURE);
}
}
}
else {
- pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout;
+ pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : pysam_stdout;
if (pileup_fp == NULL) {
- fprintf(pysamerr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
+ fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
exit(EXIT_FAILURE);
}
}
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
max_depth = conf->max_depth;
if (max_depth * sm->n > 1<<20)
- fprintf(pysamerr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
+ fprintf(pysam_stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
if (max_depth * sm->n < 8000) {
max_depth = 8000 / sm->n;
- fprintf(pysamerr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
+ fprintf(pysam_stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
}
max_indel_depth = conf->max_indel_depth * sm->n;
bam_mplp_set_maxcnt(iter, max_depth);
if ( c < conf->min_baseQ ) continue;
if (last++) putc(',', pileup_fp);
- fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
+ fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf() is very slow...
}
}
}
FILE *fh = fopen(file_list,"r");
if ( !fh )
{
- fprintf(pysamerr,"%s: %s\n", file_list,strerror(errno));
+ fprintf(pysam_stderr,"%s: %s\n", file_list,strerror(errno));
return 1;
}
for (i=0; i<len; i++)
if (!isprint(buf[i])) { safe_to_print = 0; break; }
if ( safe_to_print )
- fprintf(pysamerr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+ fprintf(pysam_stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
else
- fprintf(pysamerr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+ fprintf(pysam_stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
return 1;
}
fclose(fh);
if ( !nfiles )
{
- fprintf(pysamerr,"No files read from %s\n", file_list);
+ fprintf(pysam_stderr,"No files read from %s\n", file_list);
return 1;
}
*argv = files;
for(i=0; i<n_tags; i++)
{
if ( !strcasecmp(tags[i],"DP") ) flag |= B2B_FMT_DP;
- else if ( !strcasecmp(tags[i],"DV") ) { flag |= B2B_FMT_DV; fprintf(pysamerr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DV") ) { flag |= B2B_FMT_DV; fprintf(pysam_stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
else if ( !strcasecmp(tags[i],"SP") ) flag |= B2B_FMT_SP;
- else if ( !strcasecmp(tags[i],"DP4") ) { flag |= B2B_FMT_DP4; fprintf(pysamerr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
- else if ( !strcasecmp(tags[i],"DPR") ) { flag |= B2B_FMT_DPR; fprintf(pysamerr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
- else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(pysamerr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DP4") ) { flag |= B2B_FMT_DP4; fprintf(pysam_stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DPR") ) { flag |= B2B_FMT_DPR; fprintf(pysam_stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(pysam_stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
else if ( !strcasecmp(tags[i],"AD") ) flag |= B2B_FMT_AD;
else if ( !strcasecmp(tags[i],"ADF") ) flag |= B2B_FMT_ADF;
else if ( !strcasecmp(tags[i],"ADR") ) flag |= B2B_FMT_ADR;
else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
else
{
- fprintf(pysamerr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
+ fprintf(pysam_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
exit(EXIT_FAILURE);
}
free(tags[i]);
" -b, --bam-list FILE list of input BAM filenames, one per line\n"
" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
-" -d, --max-depth INT max per-BAM depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
fprintf(fp,
" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
fprintf(fp,
" -I, --skip-indels do not perform indel calling\n"
-" -L, --max-idepth INT maximum per-sample depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
fprintf(fp,
" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
fprintf(fp,
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
mplp.rflag_require = bam_str2flag(optarg);
- if ( mplp.rflag_require<0 ) { fprintf(pysamerr,"Could not parse --rf %s\n", optarg); return 1; }
+ if ( mplp.rflag_require<0 ) { fprintf(pysam_stderr,"Could not parse --rf %s\n", optarg); return 1; }
break;
case 2 :
mplp.rflag_filter = bam_str2flag(optarg);
- if ( mplp.rflag_filter<0 ) { fprintf(pysamerr,"Could not parse --ff %s\n", optarg); return 1; }
+ if ( mplp.rflag_filter<0 ) { fprintf(pysam_stderr,"Could not parse --ff %s\n", optarg); return 1; }
break;
case 3 : mplp.output_fname = optarg; break;
case 4 : mplp.openQ = atoi(optarg); break;
case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; break;
case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; break;
case 'B': mplp.flag &= ~MPLP_REALN; break;
- case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(pysamerr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break;
- case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(pysamerr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break;
- case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(pysamerr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break;
+ case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(pysam_stderr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break;
+ case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(pysam_stderr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break;
+ case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(pysam_stderr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break;
case 'I': mplp.flag |= MPLP_NO_INDEL; break;
case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
char buf[1024];
mplp.rghash = khash_str2int_init();
if ((fp_rg = fopen(optarg, "r")) == NULL)
- fprintf(pysamerr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
+ fprintf(pysam_stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me...
khash_str2int_inc(mplp.rghash, strdup(buf));
fclose(fp_rg);
if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
/* else fall-through */
case '?':
- print_usage(pysamerr, &mplp);
+ print_usage(pysam_stderr, &mplp);
return 1;
}
}
if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
{
- fprintf(pysamerr,"Error: The -B option cannot be combined with -E\n");
+ fprintf(pysam_stderr,"Error: The -B option cannot be combined with -E\n");
return 1;
}
if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
if (argc == 1)
{
- print_usage(pysamerr, &mplp);
+ print_usage(pysam_stderr, &mplp);
return 1;
}
int ret;
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/bgzf.h>
"Options:\n"
" -v verbose output (repeat for more verbosity)\n"
"\n"
+"Notes:\n"
+"\n"
+"1. In order to use this command effectively, you should check its exit status;\n"
+" without any -v options it will NOT print any output, even when some files\n"
+" fail the check. One way to use quickcheck might be as a check that all\n"
+" BAM files in a directory are okay:\n"
+"\n"
+"\tsamtools quickcheck *.bam && echo 'all ok' \\\n"
+"\t || echo 'fail!'\n"
+"\n"
+" To also determine which files have failed, use the -v option:\n"
+"\n"
+"\tsamtools quickcheck -v *.bam > bad_bams.fofn \\\n"
+"\t && echo 'all ok' \\\n"
+"\t || echo 'some files failed check, see bad_bams.fofn'\n"
);
}
}
}
- hts_close(hts_fp);
+ if (hts_close(hts_fp) < 0) {
+ file_state |= 32;
+ if (verbose >= 2) fprintf(stderr, "%s did not close cleanly\n", fn);
+ }
}
if (file_state > 0 && verbose >= 1) {
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/bgzf.h>
"Options:\n"
" -v verbose output (repeat for more verbosity)\n"
"\n"
+"Notes:\n"
+"\n"
+"1. In order to use this command effectively, you should check its exit status;\n"
+" without any -v options it will NOT print any output, even when some files\n"
+" fail the check. One way to use quickcheck might be as a check that all\n"
+" BAM files in a directory are okay:\n"
+"\n"
+"\tsamtools quickcheck *.bam && echo 'all ok' \\\n"
+"\t || echo 'fail!'\n"
+"\n"
+" To also determine which files have failed, use the -v option:\n"
+"\n"
+"\tsamtools quickcheck -v *.bam > bad_bams.fofn \\\n"
+"\t && echo 'all ok' \\\n"
+"\t || echo 'some files failed check, see bad_bams.fofn'\n"
);
}
verbose++;
break;
default:
- usage_quickcheck(pysamerr);
+ usage_quickcheck(pysam_stderr);
return 1;
}
}
argv += optind;
if (argc < 1) {
- usage_quickcheck(stdout);
+ usage_quickcheck(pysam_stdout);
return 1;
}
if (verbose >= 2) {
- fprintf(pysamerr, "verbosity set to %d\n", verbose);
+ fprintf(pysam_stderr, "verbosity set to %d\n", verbose);
}
if (verbose >= 4) {
char* fn = argv[i];
int file_state = 0;
- if (verbose >= 3) fprintf(pysamerr, "checking %s\n", fn);
+ if (verbose >= 3) fprintf(pysam_stderr, "checking %s\n", fn);
// attempt to open
htsFile *hts_fp = hts_open(fn, "r");
if (hts_fp == NULL) {
- if (verbose >= 2) fprintf(pysamerr, "%s could not be opened for reading\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading\n", fn);
file_state |= 2;
}
else {
- if (verbose >= 3) fprintf(pysamerr, "opened %s\n", fn);
+ if (verbose >= 3) fprintf(pysam_stderr, "opened %s\n", fn);
// make sure we have sequence data
const htsFormat *fmt = hts_get_format(hts_fp);
if (fmt->category != sequence_data ) {
- if (verbose >= 2) fprintf(pysamerr, "%s was not identified as sequence data\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data\n", fn);
file_state |= 4;
}
else {
- if (verbose >= 3) fprintf(pysamerr, "%s is sequence data\n", fn);
+ if (verbose >= 3) fprintf(pysam_stderr, "%s is sequence data\n", fn);
// check header
bam_hdr_t *header = sam_hdr_read(hts_fp);
if (header->n_targets <= 0) {
- if (verbose >= 2) fprintf(pysamerr, "%s had no targets in header\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header\n", fn);
file_state |= 8;
}
else {
- if (verbose >= 3) fprintf(pysamerr, "%s has %d targets in header\n", fn, header->n_targets);
+ if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header\n", fn, header->n_targets);
}
// only check EOF on BAM for now
// TODO implement and use hts_check_EOF() to include CRAM support
if (fmt->format == bam) {
if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
- if (verbose >= 2) fprintf(pysamerr, "%s was missing EOF block\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block\n", fn);
file_state |= 16;
}
else {
- if (verbose >= 3) fprintf(pysamerr, "%s has good EOF block\n", fn);
+ if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block\n", fn);
}
}
}
- hts_close(hts_fp);
+ if (hts_close(hts_fp) < 0) {
+ file_state |= 32;
+ if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly\n", fn);
+ }
}
if (file_state > 0 && verbose >= 1) {
- fprintf(stdout, "%s\n", fn);
+ fprintf(pysam_stdout, "%s\n", fn);
}
ret |= file_state;
}
/* bam_reheader.c -- reheader subcommand.
Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
+ Copyright (C) 2012-2015 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
const char *arg_list, int add_PG)
{
- BGZF *fp;
+ BGZF *fp = NULL;
ssize_t len;
- uint8_t *buf;
+ uint8_t *buf = NULL;
+ SAM_hdr *sh = NULL;
if (in->is_write) return -1;
buf = malloc(BUF_SIZE);
+ if (!buf) {
+ fprintf(stderr, "Out of memory\n");
+ return -1;
+ }
if (bam_hdr_read(in) == NULL) {
fprintf(stderr, "Couldn't read header\n");
- free(buf);
- return -1;
+ goto fail;
}
fp = bgzf_fdopen(fd, "w");
+ if (!fp) {
+ print_error_errno("reheader", "Couldn't open output file");
+ goto fail;
+ }
if (add_PG) {
// Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
- SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text);
+ sh = sam_hdr_parse_(h->text, h->l_text);
+ if (!sh)
+ goto fail;
if (sam_hdr_add_PG(sh, "samtools",
"VN", samtools_version(),
arg_list ? "CL": NULL,
arg_list ? arg_list : NULL,
NULL) != 0)
- return -1;
+ goto fail;
free(h->text);
h->text = strdup(sam_hdr_str(sh));
h->l_text = sam_hdr_length(sh);
if (!h->text)
- return -1;
+ goto fail;
sam_hdr_free(sh);
+ sh = NULL;
}
- bam_hdr_write(fp, h);
+ if (bam_hdr_write(fp, h) < 0) {
+ print_error_errno("reheader", "Couldn't write header");
+ goto fail;
+ }
if (in->block_offset < in->block_length) {
- bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
- bgzf_flush(fp);
+ if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_flush(fp) < 0) goto write_fail;
+ }
+ while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
+ if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail;
+ }
+ if (len < 0) {
+ fprintf(stderr, "[%s] Error reading input file\n", __func__);
+ goto fail;
}
- while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0)
- bgzf_raw_write(fp, buf, len);
free(buf);
fp->block_offset = in->block_offset = 0;
- bgzf_close(fp);
+ if (bgzf_close(fp) < 0) {
+ fprintf(stderr, "[%s] Error closing output file\n", __func__);
+ return -1;
+ }
return 0;
+
+ write_fail:
+ print_error_errno("reheader", "Error writing to output file");
+ fail:
+ bgzf_close(fp);
+ free(buf);
+ sam_hdr_free(sh);
+ return -1;
}
/*
{ // read the header
samFile *fph = sam_open(argv[optind], "r");
if (fph == 0) {
- fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[optind]);
+ print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]);
return 1;
}
h = sam_hdr_read(fph);
}
in = sam_open(argv[optind+1], inplace?"r+":"r");
if (in == 0) {
- fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[optind+1]);
+ print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]);
return 1;
}
if (hts_get_format(in)->format == bam) {
/* bam_reheader.c -- reheader subcommand.
Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
+ Copyright (C) 2012-2015 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
const char *arg_list, int add_PG)
{
- BGZF *fp;
+ BGZF *fp = NULL;
ssize_t len;
- uint8_t *buf;
+ uint8_t *buf = NULL;
+ SAM_hdr *sh = NULL;
if (in->is_write) return -1;
buf = malloc(BUF_SIZE);
- if (bam_hdr_read(in) == NULL) {
- fprintf(pysamerr, "Couldn't read header\n");
- free(buf);
+ if (!buf) {
+ fprintf(pysam_stderr, "Out of memory\n");
return -1;
}
+ if (bam_hdr_read(in) == NULL) {
+ fprintf(pysam_stderr, "Couldn't read header\n");
+ goto fail;
+ }
fp = bgzf_fdopen(fd, "w");
+ if (!fp) {
+ print_error_errno("reheader", "Couldn't open output file");
+ goto fail;
+ }
if (add_PG) {
// Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
- SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text);
+ sh = sam_hdr_parse_(h->text, h->l_text);
+ if (!sh)
+ goto fail;
if (sam_hdr_add_PG(sh, "samtools",
"VN", samtools_version(),
arg_list ? "CL": NULL,
arg_list ? arg_list : NULL,
NULL) != 0)
- return -1;
+ goto fail;
free(h->text);
h->text = strdup(sam_hdr_str(sh));
h->l_text = sam_hdr_length(sh);
if (!h->text)
- return -1;
+ goto fail;
sam_hdr_free(sh);
+ sh = NULL;
}
- bam_hdr_write(fp, h);
+ if (bam_hdr_write(fp, h) < 0) {
+ print_error_errno("reheader", "Couldn't write header");
+ goto fail;
+ }
if (in->block_offset < in->block_length) {
- bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
- bgzf_flush(fp);
+ if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_flush(fp) < 0) goto write_fail;
+ }
+ while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
+ if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail;
+ }
+ if (len < 0) {
+ fprintf(pysam_stderr, "[%s] Error reading input file\n", __func__);
+ goto fail;
}
- while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0)
- bgzf_raw_write(fp, buf, len);
free(buf);
fp->block_offset = in->block_offset = 0;
- bgzf_close(fp);
+ if (bgzf_close(fp) < 0) {
+ fprintf(pysam_stderr, "[%s] Error closing output file\n", __func__);
+ return -1;
+ }
return 0;
+
+ write_fail:
+ print_error_errno("reheader", "Error writing to output file");
+ fail:
+ bgzf_close(fp);
+ free(buf);
+ sam_hdr_free(sh);
+ return -1;
}
/*
- * Reads a file and outputs a new CRAM file to stdout with 'h'
+ * Reads a file and outputs a new CRAM file to pysam_stdout with 'h'
* replaced as the header. No checks are made to the validity.
*
* FIXME: error checking
if (cram_major_vers(fd) < 2 ||
cram_major_vers(fd) > 3) {
- fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__,
+ fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__,
cram_major_vers(fd));
goto err;
}
goto err;
if (cram_block_get_uncomp_size(b) < header_len+4) {
- fprintf(pysamerr, "New header will not fit. Use non-inplace version (%d > %d)\n",
+ fprintf(pysam_stderr, "New header will not fit. Use non-inplace version (%d > %d)\n",
header_len+4, cram_block_get_uncomp_size(b));
ret = -2;
goto err;
if (cram_major_vers(fd) < 2 ||
cram_major_vers(fd) > 3) {
- fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__,
+ fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__,
cram_major_vers(fd));
goto err;
}
goto err;
if (old_container_sz != container_sz) {
- fprintf(pysamerr, "Quirk of fate makes this troublesome! "
+ fprintf(pysam_stderr, "Quirk of fate makes this troublesome! "
"Please use non-inplace version.\n");
goto err;
}
goto err;
if (cram_block_size(b) > cram_container_get_length(c)) {
- fprintf(pysamerr, "New header will not fit. Use non-inplace version"
+ fprintf(pysam_stderr, "New header will not fit. Use non-inplace version"
" (%d > %d)\n",
(int)cram_block_size(b), cram_container_get_length(c));
ret = -2;
case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG);
case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG);
default:
- fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__,
+ fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__,
cram_major_vers(fd));
return -1;
}
"Options:\n"
" -P, --no-PG Do not generate an @PG header line.\n"
" -i, --in-place Modify the bam/cram file directly.\n"
- " (Defaults to outputting to stdout.)\n");
+ " (Defaults to outputting to pysam_stdout.)\n");
exit(ret);
}
};
while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) {
- fprintf(stderr, " %i %c %s\n", optind, c, argv[optind-1]);
switch (c) {
case 'P': add_PG = 0; break;
case 'i': inplace = 1; break;
- case 'h': usage(stdout, 0); break;
+ case 'h': usage(pysam_stdout, 0); break;
default:
- fprintf(pysamerr, "Invalid option '%c'\n", c);
- usage(pysamerr, 1);
+ fprintf(pysam_stderr, "Invalid option '%c'\n", c);
+ usage(pysam_stderr, 1);
}
}
if (argc - optind != 2)
- usage(pysamerr, 1);
+ usage(pysam_stderr, 1);
{ // read the header
samFile *fph = sam_open(argv[optind], "r");
if (fph == 0) {
- fprintf(pysamerr, "[%s] fail to read the header from %s.\n", __func__, argv[optind]);
+ print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]);
return 1;
}
h = sam_hdr_read(fph);
sam_close(fph);
if (h == NULL) {
- fprintf(pysamerr, "[%s] failed to read the header for '%s'.\n",
+ fprintf(pysam_stderr, "[%s] failed to read the header for '%s'.\n",
__func__, argv[1]);
return 1;
}
}
in = sam_open(argv[optind+1], inplace?"r+":"r");
if (in == 0) {
- fprintf(pysamerr, "[%s] fail to open file %s.\n", __func__, argv[optind+1]);
+ print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]);
return 1;
}
if (hts_get_format(in)->format == bam) {
- r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, add_PG);
+ r = bam_reheader(in->fp.bgzf, h, fileno(pysam_stdout), arg_list, add_PG);
} else {
if (inplace)
r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include "htslib/sam.h"
#include "sam_opts.h"
+#include "samtools.h"
#include "bam.h" // for bam_get_library
typedef bam1_t *bam1_p;
stack->a[stack->n++] = b;
}
-static inline void dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr)
+static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr)
{
int i;
for (i = 0; i != stack->n; ++i) {
- sam_write1(out, hdr, stack->a[i]);
+ if (sam_write1(out, hdr, stack->a[i]) < 0) return -1;
bam_destroy1(stack->a[i]);
+ stack->a[i] = NULL;
}
stack->n = 0;
+ return 0;
+}
+
+static inline void clear_stack(tmp_stack_t *stack) {
+ int i;
+ if (!stack->a) return;
+ for (i = 0; i != stack->n; ++i) {
+ bam_destroy1(stack->a[i]);
+ }
}
static void clear_del_set(khash_t(name) *del_set)
return q;
}
-void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out)
+int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out)
{
- bam1_t *b;
- int last_tid = -1, last_pos = -1;
+ bam1_t *b = NULL;
+ int last_tid = -1, last_pos = -1, r;
tmp_stack_t stack;
khint_t k;
- khash_t(lib) *aux;
- khash_t(name) *del_set;
+ khash_t(lib) *aux = NULL;
+ khash_t(name) *del_set = NULL;
+ memset(&stack, 0, sizeof(tmp_stack_t));
aux = kh_init(lib);
del_set = kh_init(name);
b = bam_init1();
- memset(&stack, 0, sizeof(tmp_stack_t));
+ if (!aux || !del_set || !b) {
+ perror(__func__);
+ goto fail;
+ }
kh_resize(name, del_set, 4 * BUFFER_SIZE);
- while (sam_read1(in, hdr, b) >= 0) {
+ while ((r = sam_read1(in, hdr, b)) >= 0) {
bam1_core_t *c = &b->core;
if (c->tid != last_tid || last_pos != c->pos) {
- dump_best(&stack, out, hdr); // write the result
+ if (dump_best(&stack, out, hdr) < 0) goto write_fail; // write the result
clear_best(aux, BUFFER_SIZE);
if (c->tid != last_tid) {
clear_best(aux, 0);
clear_del_set(del_set);
}
if ((int)c->tid == -1) { // append unmapped reads
- sam_write1(out, hdr, b);
- while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b);
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
+ while ((r = sam_read1(in, hdr, b)) >= 0) {
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
+ }
break;
}
last_tid = c->tid;
}
}
if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
- sam_write1(out, hdr, b);
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
} else if (c->isize > 0) { // paired, head
uint64_t key = (uint64_t)c->pos<<32 | c->isize;
const char *lib;
if (k != kh_end(del_set)) {
free((char*)kh_key(del_set, k));
kh_del(name, del_set, k);
- } else sam_write1(out, hdr, b);
+ } else {
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
+ }
}
last_pos = c->pos;
}
+ if (r < -1) {
+ fprintf(stderr, "[%s] failed to read input file\n", __func__);
+ goto fail;
+ }
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
lib_aux_t *q = &kh_val(aux, k);
- dump_best(&stack, out, hdr);
+ if (dump_best(&stack, out, hdr) < 0) goto write_fail;
fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
kh_destroy(pos, q->best_hash);
free((char*)kh_key(aux, k));
+ kh_del(lib, aux, k);
}
}
kh_destroy(lib, aux);
kh_destroy(name, del_set);
free(stack.a);
bam_destroy1(b);
+ return 0;
+
+ write_fail:
+ print_error_errno("rmdup", "failed to write record");
+ fail:
+ clear_stack(&stack);
+ free(stack.a);
+ if (aux) {
+ for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+ if (kh_exist(aux, k)) {
+ lib_aux_t *q = &kh_val(aux, k);
+ kh_destroy(pos, q->best_hash);
+ free((char*)kh_key(aux, k));
+ }
+ }
+ kh_destroy(lib, aux);
+ }
+ if (del_set) {
+ clear_del_set(del_set);
+ kh_destroy(name, del_set);
+ }
+ bam_destroy1(b);
+ return 1;
}
-void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se);
+int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se);
static int rmdup_usage(void) {
fprintf(stderr, "\n");
int bam_rmdup(int argc, char *argv[])
{
- int c, is_se = 0, force_se = 0;
+ int c, ret, is_se = 0, force_se = 0;
samFile *in, *out;
bam_hdr_t *header;
char wmode[3] = {'w', 'b', 0};
return rmdup_usage();
in = sam_open_format(argv[optind], "r", &ga.in);
+ if (!in) {
+ print_error_errno("rmdup", "failed to open \"%s\" for input", argv[optind]);
+ return 1;
+ }
header = sam_hdr_read(in);
if (header == NULL || header->n_targets == 0) {
fprintf(stderr, "[bam_rmdup] input SAM does not have header. Abort!\n");
sam_open_mode(wmode+1, argv[optind+1], NULL);
out = sam_open_format(argv[optind+1], wmode, &ga.out);
- if (in == 0 || out == 0) {
- fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+ if (!out) {
+ print_error_errno("rmdup", "failed to open \"%s\" for output", argv[optind+1]);
+ return 1;
+ }
+ if (sam_hdr_write(out, header) < 0) {
+ print_error_errno("rmdup", "failed to write header");
return 1;
}
- sam_hdr_write(out, header);
- if (is_se) bam_rmdupse_core(in, header, out, force_se);
- else bam_rmdup_core(in, header, out);
+ if (is_se) ret = bam_rmdupse_core(in, header, out, force_se);
+ else ret = bam_rmdup_core(in, header, out);
+
bam_hdr_destroy(header);
- sam_close(in); sam_close(out);
- return 0;
+ sam_close(in);
+ if (sam_close(out) < 0) {
+ fprintf(stderr, "[bam_rmdup] error closing output file\n");
+ ret = 1;
+ }
+ return ret;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include "htslib/sam.h"
#include "sam_opts.h"
+#include "samtools.h"
#include "bam.h" // for bam_get_library
typedef bam1_t *bam1_p;
stack->a[stack->n++] = b;
}
-static inline void dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr)
+static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr)
{
int i;
for (i = 0; i != stack->n; ++i) {
- sam_write1(out, hdr, stack->a[i]);
+ if (sam_write1(out, hdr, stack->a[i]) < 0) return -1;
bam_destroy1(stack->a[i]);
+ stack->a[i] = NULL;
}
stack->n = 0;
+ return 0;
+}
+
+static inline void clear_stack(tmp_stack_t *stack) {
+ int i;
+ if (!stack->a) return;
+ for (i = 0; i != stack->n; ++i) {
+ bam_destroy1(stack->a[i]);
+ }
}
static void clear_del_set(khash_t(name) *del_set)
return q;
}
-void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out)
+int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out)
{
- bam1_t *b;
- int last_tid = -1, last_pos = -1;
+ bam1_t *b = NULL;
+ int last_tid = -1, last_pos = -1, r;
tmp_stack_t stack;
khint_t k;
- khash_t(lib) *aux;
- khash_t(name) *del_set;
+ khash_t(lib) *aux = NULL;
+ khash_t(name) *del_set = NULL;
+ memset(&stack, 0, sizeof(tmp_stack_t));
aux = kh_init(lib);
del_set = kh_init(name);
b = bam_init1();
- memset(&stack, 0, sizeof(tmp_stack_t));
+ if (!aux || !del_set || !b) {
+ perror(__func__);
+ goto fail;
+ }
kh_resize(name, del_set, 4 * BUFFER_SIZE);
- while (sam_read1(in, hdr, b) >= 0) {
+ while ((r = sam_read1(in, hdr, b)) >= 0) {
bam1_core_t *c = &b->core;
if (c->tid != last_tid || last_pos != c->pos) {
- dump_best(&stack, out, hdr); // write the result
+ if (dump_best(&stack, out, hdr) < 0) goto write_fail; // write the result
clear_best(aux, BUFFER_SIZE);
if (c->tid != last_tid) {
clear_best(aux, 0);
if (kh_size(del_set)) { // check
- fprintf(pysamerr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
+ fprintf(pysam_stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
clear_del_set(del_set);
}
if ((int)c->tid == -1) { // append unmapped reads
- sam_write1(out, hdr, b);
- while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b);
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
+ while ((r = sam_read1(in, hdr, b)) >= 0) {
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
+ }
break;
}
last_tid = c->tid;
- fprintf(pysamerr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]);
+ fprintf(pysam_stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]);
}
}
if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
- sam_write1(out, hdr, b);
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
} else if (c->isize > 0) { // paired, head
uint64_t key = (uint64_t)c->pos<<32 | c->isize;
const char *lib;
bam_copy1(p, b); // replaced as b
} else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed
if (ret == 0)
- fprintf(pysamerr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b));
+ fprintf(pysam_stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b));
} else { // not found in best_hash
kh_val(q->best_hash, k) = bam_dup1(b);
stack_insert(&stack, kh_val(q->best_hash, k));
if (k != kh_end(del_set)) {
free((char*)kh_key(del_set, k));
kh_del(name, del_set, k);
- } else sam_write1(out, hdr, b);
+ } else {
+ if (sam_write1(out, hdr, b) < 0) goto write_fail;
+ }
}
last_pos = c->pos;
}
+ if (r < -1) {
+ fprintf(pysam_stderr, "[%s] failed to read input file\n", __func__);
+ goto fail;
+ }
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
lib_aux_t *q = &kh_val(aux, k);
- dump_best(&stack, out, hdr);
- fprintf(pysamerr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+ if (dump_best(&stack, out, hdr) < 0) goto write_fail;
+ fprintf(pysam_stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
kh_destroy(pos, q->best_hash);
free((char*)kh_key(aux, k));
+ kh_del(lib, aux, k);
}
}
kh_destroy(lib, aux);
kh_destroy(name, del_set);
free(stack.a);
bam_destroy1(b);
+ return 0;
+
+ write_fail:
+ print_error_errno("rmdup", "failed to write record");
+ fail:
+ clear_stack(&stack);
+ free(stack.a);
+ if (aux) {
+ for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+ if (kh_exist(aux, k)) {
+ lib_aux_t *q = &kh_val(aux, k);
+ kh_destroy(pos, q->best_hash);
+ free((char*)kh_key(aux, k));
+ }
+ }
+ kh_destroy(lib, aux);
+ }
+ if (del_set) {
+ clear_del_set(del_set);
+ kh_destroy(name, del_set);
+ }
+ bam_destroy1(b);
+ return 1;
}
-void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se);
+int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se);
static int rmdup_usage(void) {
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
- fprintf(pysamerr, "Option: -s rmdup for SE reads\n");
- fprintf(pysamerr, " -S treat PE reads as SE in rmdup (force -s)\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
+ fprintf(pysam_stderr, "Option: -s rmdup for SE reads\n");
+ fprintf(pysam_stderr, " -S treat PE reads as SE in rmdup (force -s)\n");
- sam_global_opt_help(pysamerr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....");
return 1;
}
int bam_rmdup(int argc, char *argv[])
{
- int c, is_se = 0, force_se = 0;
+ int c, ret, is_se = 0, force_se = 0;
samFile *in, *out;
bam_hdr_t *header;
char wmode[3] = {'w', 'b', 0};
return rmdup_usage();
in = sam_open_format(argv[optind], "r", &ga.in);
+ if (!in) {
+ print_error_errno("rmdup", "failed to open \"%s\" for input", argv[optind]);
+ return 1;
+ }
header = sam_hdr_read(in);
if (header == NULL || header->n_targets == 0) {
- fprintf(pysamerr, "[bam_rmdup] input SAM does not have header. Abort!\n");
+ fprintf(pysam_stderr, "[bam_rmdup] input SAM does not have header. Abort!\n");
return 1;
}
sam_open_mode(wmode+1, argv[optind+1], NULL);
out = sam_open_format(argv[optind+1], wmode, &ga.out);
- if (in == 0 || out == 0) {
- fprintf(pysamerr, "[bam_rmdup] fail to read/write input files\n");
+ if (!out) {
+ print_error_errno("rmdup", "failed to open \"%s\" for output", argv[optind+1]);
+ return 1;
+ }
+ if (sam_hdr_write(out, header) < 0) {
+ print_error_errno("rmdup", "failed to write header");
return 1;
}
- sam_hdr_write(out, header);
- if (is_se) bam_rmdupse_core(in, header, out, force_se);
- else bam_rmdup_core(in, header, out);
+ if (is_se) ret = bam_rmdupse_core(in, header, out, force_se);
+ else ret = bam_rmdup_core(in, header, out);
+
bam_hdr_destroy(header);
- sam_close(in); sam_close(out);
- return 0;
+ sam_close(in);
+ if (sam_close(out) < 0) {
+ fprintf(pysam_stderr, "[bam_rmdup] error closing output file\n");
+ ret = 1;
+ }
+ return ret;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include <stdio.h>
#include "bam.h" // for bam_get_library
#include "htslib/sam.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
+#include "samtools.h"
#define QUEUE_CLEAR_SIZE 0x100000
#define MAX_POS 0x7fffffff
kh_del(best, h, k);
}
-static void dump_alignment(samFile *out, bam_hdr_t *hdr,
- queue_t *queue, int32_t pos, khash_t(lib) *h)
+static int dump_alignment(samFile *out, bam_hdr_t *hdr,
+ queue_t *queue, int32_t pos, khash_t(lib) *h)
{
if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
khint_t k;
continue;
}
if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
- sam_write1(out, hdr, q->b);
+ if (sam_write1(out, hdr, q->b) < 0) return -1;
q->b->l_data = 0;
kl_shift(q, queue, 0);
}
}
}
}
+ return 0;
}
-void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se)
+int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se)
{
- bam1_t *b;
- queue_t *queue;
+ bam1_t *b = NULL;
+ queue_t *queue = NULL;
khint_t k;
- int last_tid = -2;
- khash_t(lib) *aux;
+ int last_tid = -2, r;
+ khash_t(lib) *aux = NULL;
aux = kh_init(lib);
b = bam_init1();
queue = kl_init(q);
- while (sam_read1(in, hdr, b) >= 0) {
+ if (!aux || !b || !queue) {
+ perror(__func__);
+ goto fail;
+ }
+
+ while ((r = sam_read1(in, hdr, b)) >= 0) {
bam1_core_t *c = &b->core;
int endpos = bam_endpos(b);
int score = sum_qual(b);
if (last_tid != c->tid) {
- if (last_tid >= 0) dump_alignment(out, hdr, queue, MAX_POS, aux);
+ if (last_tid >= 0) {
+ if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0)
+ goto write_fail;
+ }
last_tid = c->tid;
- } else dump_alignment(out, hdr, queue, c->pos, aux);
+ } else {
+ if (dump_alignment(out, hdr, queue, c->pos, aux) < 0)
+ goto write_fail;
+ }
if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
push_queue(queue, b, endpos, score);
} else {
} else kh_val(h, k) = push_queue(queue, b, endpos, score);
}
}
- dump_alignment(out, hdr, queue, MAX_POS, aux);
+ if (r < -1) {
+ fprintf(stderr, "[%s] error reading input file\n", __func__);
+ goto fail;
+ }
+
+ if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0) goto write_fail;
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
kh_destroy(best, q->left); kh_destroy(best, q->rght);
free((char*)kh_key(aux, k));
+ kh_del(lib, aux, k);
}
}
kh_destroy(lib, aux);
bam_destroy1(b);
kl_destroy(q, queue);
+ return 0;
+
+ write_fail:
+ print_error_errno("rmdup", "failed to write record");
+ fail:
+ if (aux) {
+ for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+ if (kh_exist(aux, k)) {
+ lib_aux_t *q = &kh_val(aux, k);
+ kh_destroy(best, q->left);
+ kh_destroy(best, q->rght);
+ free((char*)kh_key(aux, k));
+ }
+ }
+ kh_destroy(lib, aux);
+ }
+ bam_destroy1(b);
+ kl_destroy(q, queue);
+ return 1;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include <stdio.h>
#include "bam.h" // for bam_get_library
#include "htslib/sam.h"
#include "htslib/khash.h"
#include "htslib/klist.h"
+#include "samtools.h"
#define QUEUE_CLEAR_SIZE 0x100000
#define MAX_POS 0x7fffffff
kh_del(best, h, k);
}
-static void dump_alignment(samFile *out, bam_hdr_t *hdr,
- queue_t *queue, int32_t pos, khash_t(lib) *h)
+static int dump_alignment(samFile *out, bam_hdr_t *hdr,
+ queue_t *queue, int32_t pos, khash_t(lib) *h)
{
if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
khint_t k;
continue;
}
if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
- sam_write1(out, hdr, q->b);
+ if (sam_write1(out, hdr, q->b) < 0) return -1;
q->b->l_data = 0;
kl_shift(q, queue, 0);
}
}
}
}
+ return 0;
}
-void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se)
+int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se)
{
- bam1_t *b;
- queue_t *queue;
+ bam1_t *b = NULL;
+ queue_t *queue = NULL;
khint_t k;
- int last_tid = -2;
- khash_t(lib) *aux;
+ int last_tid = -2, r;
+ khash_t(lib) *aux = NULL;
aux = kh_init(lib);
b = bam_init1();
queue = kl_init(q);
- while (sam_read1(in, hdr, b) >= 0) {
+ if (!aux || !b || !queue) {
+ perror(__func__);
+ goto fail;
+ }
+
+ while ((r = sam_read1(in, hdr, b)) >= 0) {
bam1_core_t *c = &b->core;
int endpos = bam_endpos(b);
int score = sum_qual(b);
if (last_tid != c->tid) {
- if (last_tid >= 0) dump_alignment(out, hdr, queue, MAX_POS, aux);
+ if (last_tid >= 0) {
+ if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0)
+ goto write_fail;
+ }
last_tid = c->tid;
- } else dump_alignment(out, hdr, queue, c->pos, aux);
+ } else {
+ if (dump_alignment(out, hdr, queue, c->pos, aux) < 0)
+ goto write_fail;
+ }
if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
push_queue(queue, b, endpos, score);
} else {
} else kh_val(h, k) = push_queue(queue, b, endpos, score);
}
}
- dump_alignment(out, hdr, queue, MAX_POS, aux);
+ if (r < -1) {
+ fprintf(pysam_stderr, "[%s] error reading input file\n", __func__);
+ goto fail;
+ }
+
+ if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0) goto write_fail;
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
lib_aux_t *q = &kh_val(aux, k);
- fprintf(pysamerr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+ fprintf(pysam_stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
kh_destroy(best, q->left); kh_destroy(best, q->rght);
free((char*)kh_key(aux, k));
+ kh_del(lib, aux, k);
}
}
kh_destroy(lib, aux);
bam_destroy1(b);
kl_destroy(q, queue);
+ return 0;
+
+ write_fail:
+ print_error_errno("rmdup", "failed to write record");
+ fail:
+ if (aux) {
+ for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+ if (kh_exist(aux, k)) {
+ lib_aux_t *q = &kh_val(aux, k);
+ kh_destroy(best, q->left);
+ kh_destroy(best, q->rght);
+ free((char*)kh_key(aux, k));
+ }
+ }
+ kh_destroy(lib, aux);
+ }
+ bam_destroy1(b);
+ kl_destroy(q, queue);
+ return 1;
}
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2015 Genome Research Ltd.
+ Copyright (C) 2008-2016 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
+#include <sys/stat.h>
#include <unistd.h>
#include <getopt.h>
#include <assert.h>
hdr_match_t *new_sq_matches = NULL;
char *text;
hdr_match_t matches[2];
- int32_t i, missing;
+ int32_t i;
int32_t old_n_targets = merged_hdr->n_targets;
khiter_t iter;
int min_tid = -1;
text += matches[0].rm_eo;
}
- // Check if any new targets have been missed
- missing = 0;
+ // Copy the @SQ headers found and recreate any missing from binary header.
for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) {
if (new_sq_matches[i].rm_so >= 0) {
if (match_to_ks(translate->text, &new_sq_matches[i], out_text))
goto memfail;
if (kputc('\n', out_text) == EOF) goto memfail;
} else {
- fprintf(stderr, "[E::%s] @SQ SN (%s) found in binary header but not text header.\n",
- __func__, merged_hdr->target_name[i + old_n_targets]);
- missing++;
+ if (kputs("@SQ\tSN:", out_text) == EOF ||
+ kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF ||
+ kputs("\tLN:", out_text) == EOF ||
+ kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF ||
+ kputc('\n', out_text) == EOF) goto memfail;
}
}
- if (missing) goto fail;
free(new_sq_matches);
return 0;
static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate,
trans_tbl_t* tbl, bool merge_rg, bool merge_pg,
- char* rg_override)
+ bool copy_co, char* rg_override)
{
klist_t(hdrln) *rg_list = NULL;
klist_t(hdrln) *pg_list = NULL;
kl_destroy(hdrln, rg_list); rg_list = NULL;
kl_destroy(hdrln, pg_list); pg_list = NULL;
- // Just append @CO headers without translation
- const char *line, *end_pointer;
- for (line = translate->text; *line; line = end_pointer + 1) {
- end_pointer = strchr(line, '\n');
- if (strncmp(line, "@CO", 3) == 0) {
- if (end_pointer) {
- if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF)
- goto memfail;
- } else { // Last line with no trailing '\n'
- if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail;
- if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail;
+ if (copy_co) {
+ // Just append @CO headers without translation
+ const char *line, *end_pointer;
+ for (line = translate->text; *line; line = end_pointer + 1) {
+ end_pointer = strchr(line, '\n');
+ if (strncmp(line, "@CO", 3) == 0) {
+ if (end_pointer) {
+ if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF)
+ goto memfail;
+ } else { // Last line with no trailing '\n'
+ if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail;
+ if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail;
+ }
}
+ if (end_pointer == NULL) break;
}
- if (end_pointer == NULL) break;
}
return 0;
// Create reverse translation table for tids
int* rtrans = (int*)malloc(sizeof(int32_t)*n*n_targets);
const int32_t NOTID = INT32_MIN;
+ if (!rtrans) return NULL;
memset_pattern4((void*)rtrans, &NOTID, sizeof(int32_t)*n*n_targets);
int i;
for (i = 0; i < n; ++i) {
#define MERGE_FORCE 8 // Overwrite output BAM if it exists
#define MERGE_COMBINE_RG 16 // Combine RG tags frather than redefining them
#define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them
+#define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only)
/*
* How merging is handled
const char *reg, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
- samFile *fpout, **fp;
- heap1_t *heap;
+ samFile *fpout, **fp = NULL;
+ heap1_t *heap = NULL;
bam_hdr_t *hout = NULL;
bam_hdr_t *hin = NULL;
int i, j, *RG_len = NULL;
hts_itr_t **iter = NULL;
bam_hdr_t **hdr = NULL;
trans_tbl_t *translation_tbl = NULL;
+ int *rtrans = NULL;
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
if (hin == NULL) {
fprintf(stderr, "[bam_merge_core] couldn't read headers for '%s'\n",
headers);
- return -1;
+ goto mem_fail;
+ }
+ } else {
+ hout = bam_hdr_init();
+ if (!hout) {
+ fprintf(stderr, "[bam_merge_core] couldn't allocate bam header\n");
+ goto mem_fail;
}
+ hout->text = strdup("");
+ if (!hout->text) goto mem_fail;
}
g_is_by_qname = by_qname;
fp = (samFile**)calloc(n, sizeof(samFile*));
+ if (!fp) goto mem_fail;
heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+ if (!heap) goto mem_fail;
iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
+ if (!iter) goto mem_fail;
hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
+ if (!hdr) goto mem_fail;
translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
+ if (!translation_tbl) goto mem_fail;
RG = (char**)calloc(n, sizeof(char*));
+ if (!RG) goto mem_fail;
+
// prepare RG tag from file names
if (flag & MERGE_RG) {
RG_len = (int*)calloc(n, sizeof(int));
+ if (!RG_len) goto mem_fail;
for (i = 0; i != n; ++i) {
int l = strlen(fn[i]);
const char *s = fn[i];
for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
++j; l -= j;
RG[i] = (char*)calloc(l + 1, 1);
+ if (!RG[i]) goto mem_fail;
RG_len[i] = l;
strncpy(RG[i], s + j, l);
}
trans_tbl_t dummy;
int res;
res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
- flag & MERGE_COMBINE_PG, NULL);
+ flag & MERGE_COMBINE_PG, true, NULL);
trans_tbl_destroy(&dummy);
if (res) return -1; // FIXME: memory leak
}
bam_hdr_t *hin;
fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
- int j;
fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
- for (j = 0; j < i; ++j) {
- bam_hdr_destroy(hdr[i]);
- sam_close(fp[j]);
- }
- free(fp); free(heap);
- // FIXME: possible memory leak
- return -1;
+ goto fail;
}
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
fprintf(stderr, "[bam_merge_core] failed to read header for '%s'\n",
fn[i]);
- for (j = 0; j < i; ++j) {
- bam_hdr_destroy(hdr[i]);
- sam_close(fp[j]);
- }
- free(fp); free(heap);
- // FIXME: possible memory leak
- return -1;
+ goto fail;
}
if (trans_tbl_init(merged_hdr, hin, translation_tbl+i,
flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG,
+ (flag & MERGE_FIRST_CO)? (i == 0) : true,
RG[i]))
return -1; // FIXME: memory leak
// If we're only merging a specified region move our iters to start at that point
if (reg) {
- int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl);
-
int tid, beg, end;
- const char *name_lim = hts_parse_reg(reg, &beg, &end);
+ const char *name_lim;
+
+ rtrans = rtrans_build(n, hout->n_targets, translation_tbl);
+ if (!rtrans) goto mem_fail;
+
+ name_lim = hts_parse_reg(reg, &beg, &end);
if (name_lim) {
char *name = malloc(name_lim - reg + 1);
+ if (!name) goto mem_fail;
memcpy(name, reg, name_lim - reg);
name[name_lim - reg] = '\0';
tid = bam_name2id(hout, name);
if (tid < 0) {
if (name_lim) fprintf(stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg);
else fprintf(stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg);
- return -1;
+ goto fail;
}
for (i = 0; i < n; ++i) {
hts_idx_t *idx = sam_index_load(fp[i], fn[i]);
if (idx == NULL) {
fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
__func__, fn[i]);
- return -1;
+ goto fail;
}
if (mapped_tid != INT32_MIN) {
iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
}
hts_idx_destroy(idx);
- if (iter[i] == NULL) break;
+ if (iter[i] == NULL) {
+ if (mapped_tid != INT32_MIN) {
+ fprintf(stderr,
+ "[%s] failed to get iterator over "
+ "{%s, %d, %d, %d}\n",
+ __func__, fn[i], mapped_tid, beg, end);
+ } else {
+ fprintf(stderr,
+ "[%s] failed to get iterator over "
+ "{%s, HTS_IDX_NONE, 0, 0}\n",
+ __func__, fn[i]);
+ }
+ goto fail;
+ }
}
free(rtrans);
+ rtrans = NULL;
} else {
for (i = 0; i < n; ++i) {
if (hdr[i] == NULL) {
iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
- if (iter[i] == NULL) break;
+ if (iter[i] == NULL) {
+ fprintf(stderr, "[%s] failed to get iterator\n", __func__);
+ goto fail;
+ }
}
else iter[i] = NULL;
}
}
- if (i < n) {
- fprintf(stderr, "[%s] Memory allocation failed\n", __func__);
- return -1;
- }
-
// Load the first read from each file into the heap
for (i = 0; i < n; ++i) {
heap1_t *h = heap + i;
+ int res;
h->i = i;
h->b = bam_init1();
- if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) {
+ if (!h->b) goto mem_fail;
+ res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b);
+ if (res >= 0) {
bam_translate(h->b, translation_tbl + i);
h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
h->idx = idx++;
}
- else {
+ else if (res == -1 && (!iter[i] || iter[i]->finished)) {
h->pos = HEAP_EMPTY;
bam_destroy1(h->b);
h->b = NULL;
+ } else {
+ fprintf(stderr, "[%s] failed to read first record from %s\n",
+ __func__, fn[i]);
+ goto fail;
}
}
// Open output file and write header
if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
- fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
+ fprintf(stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno));
+ return -1;
+ }
+ if (sam_hdr_write(fpout, hout) != 0) {
+ fprintf(stderr, "[%s] failed to write header.\n", __func__);
+ sam_close(fpout);
return -1;
}
- sam_hdr_write(fpout, hout);
if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads);
// Begin the actual merge
if (rg) bam_aux_del(b, rg);
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
- sam_write1(fpout, hout, b);
+ if (sam_write1(fpout, hout, b) < 0) {
+ fprintf(stderr, "[%s] failed to write to output file.\n", __func__);
+ sam_close(fpout);
+ return -1;
+ }
if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
bam_translate(b, translation_tbl + heap->i);
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
heap->idx = idx++;
- } else if (j == -1) {
+ } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) {
heap->pos = HEAP_EMPTY;
bam_destroy1(heap->b);
heap->b = NULL;
- } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+ } else {
+ fprintf(stderr, "[bam_merge_core] error: '%s' is truncated.\n",
+ fn[heap->i]);
+ goto fail;
+ }
ks_heapadjust(heap, 0, n, heap);
}
bam_hdr_destroy(hin);
bam_hdr_destroy(hout);
free_merged_header(merged_hdr);
- sam_close(fpout);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
+ if (sam_close(fpout) < 0) {
+ fprintf(stderr, "[bam_merge_core] error closing output file\n");
+ return -1;
+ }
return 0;
+
+ mem_fail:
+ fprintf(stderr, "[bam_merge_core] Out of memory\n");
+
+ fail:
+ if (flag & MERGE_RG) {
+ if (RG) {
+ for (i = 0; i != n; ++i) free(RG[i]);
+ }
+ free(RG_len);
+ }
+ for (i = 0; i < n; ++i) {
+ if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i);
+ if (iter && iter[i]) hts_itr_destroy(iter[i]);
+ if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]);
+ if (fp && fp[i]) sam_close(fp[i]);
+ if (heap && heap[i].b) bam_destroy1(heap[i].b);
+ }
+ if (hout) bam_hdr_destroy(hout);
+ free(RG);
+ free(translation_tbl);
+ free(hdr);
+ free(iter);
+ free(heap);
+ free(fp);
+ free(rtrans);
+ return -1;
}
// Unused here but may be used by legacy samtools-using third-party code
"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
"\n"
"Options:\n"
-" -n Sort by read names\n"
+" -n Input files are sorted by read name\n"
" -r Attach RG tag (inferred from file names)\n"
" -u Uncompressed BAM output\n"
" -f Overwrite the output BAM if exist\n"
bam1_p *buf;
const bam_hdr_t *h;
int index;
+ int error;
} worker_t;
-static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
+// Returns 0 for success
+// -1 for failure
+static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
{
size_t i;
samFile* fp;
fp = sam_open_format(fn, mode, fmt);
- if (fp == NULL) return;
- sam_hdr_write(fp, h);
+ if (fp == NULL) return -1;
+ if (sam_hdr_write(fp, h) != 0) goto fail;
if (n_threads > 1) hts_set_threads(fp, n_threads);
- for (i = 0; i < l; ++i)
- sam_write1(fp, h, buf[i]);
+ for (i = 0; i < l; ++i) {
+ if (sam_write1(fp, h, buf[i]) < 0) goto fail;
+ }
+ if (sam_close(fp) < 0) return -1;
+ return 0;
+ fail:
sam_close(fp);
+ return -1;
}
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
char *name;
+ w->error = 0;
ks_mergesort(sort, w->buf_len, w->buf, 0);
name = (char*)calloc(strlen(w->prefix) + 20, 1);
+ if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0, NULL);
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
+ w->error = errno;
// Consider using CRAM temporary files if the final output is CRAM.
// Typically it is comparable speed while being smaller.
// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
// };
// opt[0].next = &opt[1];
-// write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt);
+// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0)
+// w->error = errno;
free(name);
return 0;
pthread_t *tid;
pthread_attr_t attr;
worker_t *w;
+ int n_failed = 0;
if (n_threads < 1) n_threads = 1;
if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records
b += w[i].buf_len; rest -= w[i].buf_len;
pthread_create(&tid[i], &attr, worker, &w[i]);
}
- for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+ for (i = 0; i < n_threads; ++i) {
+ pthread_join(tid[i], 0);
+ if (w[i].error != 0) {
+ fprintf(stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error));
+ n_failed++;
+ }
+ }
free(tid); free(w);
- return n_files + n_threads;
+ return (n_failed == 0)? n_files + n_threads : -1;
}
/*!
++k;
if (mem >= max_mem) {
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+ if (n_files < 0) {
+ ret = -1;
+ goto err;
+ }
mem = k = 0;
}
}
// write the final output
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
- write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt);
+ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
+ fprintf(stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno));
+ ret = -1;
+ goto err;
+ }
} else { // then merge
char **fns;
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+ if (n_files == -1) {
+ ret = -1;
+ goto err;
+ }
fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
fns = (char**)calloc(n_files, sizeof(char*));
for (i = 0; i < n_files; ++i) {
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
- MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads,
- in_fmt, out_fmt) < 0) {
+ MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
+ NULL, n_threads, in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
+ struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
- if (tmpprefix.l == 0)
- ksprintf(&tmpprefix, "%s.tmp", (nargs > 0)? argv[optind] : "STDIN");
+ if (tmpprefix.l == 0) {
+ if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout);
+ else kputc('.', &tmpprefix);
+ }
+ if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) {
+ unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
+ if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix);
+ ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
+ }
ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
tmpprefix.s, fnout, modeout, max_mem, n_threads,
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2015 Genome Research Ltd.
+ Copyright (C) 2008-2016 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
+#include <sys/stat.h>
#include <unistd.h>
#include <getopt.h>
#include <assert.h>
hdr_match_t *new_sq_matches = NULL;
char *text;
hdr_match_t matches[2];
- int32_t i, missing;
+ int32_t i;
int32_t old_n_targets = merged_hdr->n_targets;
khiter_t iter;
int min_tid = -1;
if (iter == kh_end(sq_tids)) {
// Warn about this, but it's not really fatal.
- fprintf(pysamerr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n",
+ fprintf(pysam_stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n",
__func__,
(int) (matches[1].rm_eo - matches[1].rm_so),
text + matches[1].rm_so);
text += matches[0].rm_eo;
}
- // Check if any new targets have been missed
- missing = 0;
+ // Copy the @SQ headers found and recreate any missing from binary header.
for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) {
if (new_sq_matches[i].rm_so >= 0) {
if (match_to_ks(translate->text, &new_sq_matches[i], out_text))
goto memfail;
if (kputc('\n', out_text) == EOF) goto memfail;
} else {
- fprintf(pysamerr, "[E::%s] @SQ SN (%s) found in binary header but not text header.\n",
- __func__, merged_hdr->target_name[i + old_n_targets]);
- missing++;
+ if (kputs("@SQ\tSN:", out_text) == EOF ||
+ kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF ||
+ kputs("\tLN:", out_text) == EOF ||
+ kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF ||
+ kputc('\n', out_text) == EOF) goto memfail;
}
}
- if (missing) goto fail;
free(new_sq_matches);
return 0;
idx = kh_get(c2c, pg_map, id);
if (idx == kh_end(pg_map)) {
// Not found, warn.
- fprintf(pysamerr, "[W::%s] Tag %s%s not found in @PG records\n",
+ fprintf(pysam_stderr, "[W::%s] Tag %s%s not found in @PG records\n",
__func__, search + 1, id);
} else {
// Remember new id and splice points on original string
static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate,
trans_tbl_t* tbl, bool merge_rg, bool merge_pg,
- char* rg_override)
+ bool copy_co, char* rg_override)
{
klist_t(hdrln) *rg_list = NULL;
klist_t(hdrln) *pg_list = NULL;
kl_destroy(hdrln, rg_list); rg_list = NULL;
kl_destroy(hdrln, pg_list); pg_list = NULL;
- // Just append @CO headers without translation
- const char *line, *end_pointer;
- for (line = translate->text; *line; line = end_pointer + 1) {
- end_pointer = strchr(line, '\n');
- if (strncmp(line, "@CO", 3) == 0) {
- if (end_pointer) {
- if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF)
- goto memfail;
- } else { // Last line with no trailing '\n'
- if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail;
- if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail;
+ if (copy_co) {
+ // Just append @CO headers without translation
+ const char *line, *end_pointer;
+ for (line = translate->text; *line; line = end_pointer + 1) {
+ end_pointer = strchr(line, '\n');
+ if (strncmp(line, "@CO", 3) == 0) {
+ if (end_pointer) {
+ if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF)
+ goto memfail;
+ } else { // Last line with no trailing '\n'
+ if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail;
+ if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail;
+ }
}
+ if (end_pointer == NULL) break;
}
- if (end_pointer == NULL) break;
}
return 0;
+ ks_len(&merged_hdr->out_pg)
+ ks_len(&merged_hdr->out_co));
if (txt_sz >= INT32_MAX) {
- fprintf(pysamerr, "[%s] Output header text too long\n", __func__);
+ fprintf(pysam_stderr, "[%s] Output header text too long\n", __func__);
return NULL;
}
}
} else {
char *tmp = strdup(decoded_rg);
- fprintf(pysamerr,
+ fprintf(pysam_stderr,
"[bam_translate] RG tag \"%s\" on read \"%s\" encountered "
"with no corresponding entry in header, tag lost. "
"Unknown tags are only reported once per input file for "
}
} else {
char *tmp = strdup(decoded_pg);
- fprintf(pysamerr,
+ fprintf(pysam_stderr,
"[bam_translate] PG tag \"%s\" on read \"%s\" encountered "
"with no corresponding entry in header, tag lost. "
"Unknown tags are only reported once per input file for "
// Create reverse translation table for tids
int* rtrans = (int*)malloc(sizeof(int32_t)*n*n_targets);
const int32_t NOTID = INT32_MIN;
+ if (!rtrans) return NULL;
memset_pattern4((void*)rtrans, &NOTID, sizeof(int32_t)*n*n_targets);
int i;
for (i = 0; i < n; ++i) {
#define MERGE_FORCE 8 // Overwrite output BAM if it exists
#define MERGE_COMBINE_RG 16 // Combine RG tags frather than redefining them
#define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them
+#define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only)
/*
* How merging is handled
const char *reg, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
- samFile *fpout, **fp;
- heap1_t *heap;
+ samFile *fpout, **fp = NULL;
+ heap1_t *heap = NULL;
bam_hdr_t *hout = NULL;
bam_hdr_t *hin = NULL;
int i, j, *RG_len = NULL;
hts_itr_t **iter = NULL;
bam_hdr_t **hdr = NULL;
trans_tbl_t *translation_tbl = NULL;
+ int *rtrans = NULL;
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
samFile* fpheaders = sam_open(headers, "r");
if (fpheaders == NULL) {
const char *message = strerror(errno);
- fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+ fprintf(pysam_stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
return -1;
}
hin = sam_hdr_read(fpheaders);
sam_close(fpheaders);
if (hin == NULL) {
- fprintf(pysamerr, "[bam_merge_core] couldn't read headers for '%s'\n",
+ fprintf(pysam_stderr, "[bam_merge_core] couldn't read headers for '%s'\n",
headers);
- return -1;
+ goto mem_fail;
+ }
+ } else {
+ hout = bam_hdr_init();
+ if (!hout) {
+ fprintf(pysam_stderr, "[bam_merge_core] couldn't allocate bam header\n");
+ goto mem_fail;
}
+ hout->text = strdup("");
+ if (!hout->text) goto mem_fail;
}
g_is_by_qname = by_qname;
fp = (samFile**)calloc(n, sizeof(samFile*));
+ if (!fp) goto mem_fail;
heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+ if (!heap) goto mem_fail;
iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
+ if (!iter) goto mem_fail;
hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
+ if (!hdr) goto mem_fail;
translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
+ if (!translation_tbl) goto mem_fail;
RG = (char**)calloc(n, sizeof(char*));
+ if (!RG) goto mem_fail;
+
// prepare RG tag from file names
if (flag & MERGE_RG) {
RG_len = (int*)calloc(n, sizeof(int));
+ if (!RG_len) goto mem_fail;
for (i = 0; i != n; ++i) {
int l = strlen(fn[i]);
const char *s = fn[i];
for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
++j; l -= j;
RG[i] = (char*)calloc(l + 1, 1);
+ if (!RG[i]) goto mem_fail;
RG_len[i] = l;
strncpy(RG[i], s + j, l);
}
trans_tbl_t dummy;
int res;
res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
- flag & MERGE_COMBINE_PG, NULL);
+ flag & MERGE_COMBINE_PG, true, NULL);
trans_tbl_destroy(&dummy);
if (res) return -1; // FIXME: memory leak
}
bam_hdr_t *hin;
fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
- int j;
- fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]);
- for (j = 0; j < i; ++j) {
- bam_hdr_destroy(hdr[i]);
- sam_close(fp[j]);
- }
- free(fp); free(heap);
- // FIXME: possible memory leak
- return -1;
+ fprintf(pysam_stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+ goto fail;
}
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
- fprintf(pysamerr, "[bam_merge_core] failed to read header for '%s'\n",
+ fprintf(pysam_stderr, "[bam_merge_core] failed to read header for '%s'\n",
fn[i]);
- for (j = 0; j < i; ++j) {
- bam_hdr_destroy(hdr[i]);
- sam_close(fp[j]);
- }
- free(fp); free(heap);
- // FIXME: possible memory leak
- return -1;
+ goto fail;
}
if (trans_tbl_init(merged_hdr, hin, translation_tbl+i,
flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG,
+ (flag & MERGE_FIRST_CO)? (i == 0) : true,
RG[i]))
return -1; // FIXME: memory leak
else { bam_hdr_destroy(hin); hdr[i] = NULL; }
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
- fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
+ fprintf(pysam_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
}
}
// Did we get an @HD line?
if (!merged_hdr->have_hd) {
- fprintf(pysamerr, "[W::%s] No @HD tag found.\n", __func__);
+ fprintf(pysam_stderr, "[W::%s] No @HD tag found.\n", __func__);
/* FIXME: Should we add an @HD line here, and if so what should
we put in it? Ideally we want a way of getting htslib to tell
us the SAM version number to assume given no @HD line. Is
// If we're only merging a specified region move our iters to start at that point
if (reg) {
- int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl);
-
int tid, beg, end;
- const char *name_lim = hts_parse_reg(reg, &beg, &end);
+ const char *name_lim;
+
+ rtrans = rtrans_build(n, hout->n_targets, translation_tbl);
+ if (!rtrans) goto mem_fail;
+
+ name_lim = hts_parse_reg(reg, &beg, &end);
if (name_lim) {
char *name = malloc(name_lim - reg + 1);
+ if (!name) goto mem_fail;
memcpy(name, reg, name_lim - reg);
name[name_lim - reg] = '\0';
tid = bam_name2id(hout, name);
end = INT_MAX;
}
if (tid < 0) {
- if (name_lim) fprintf(pysamerr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg);
- else fprintf(pysamerr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg);
- return -1;
+ if (name_lim) fprintf(pysam_stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg);
+ else fprintf(pysam_stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg);
+ goto fail;
}
for (i = 0; i < n; ++i) {
hts_idx_t *idx = sam_index_load(fp[i], fn[i]);
// (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
int mapped_tid = rtrans[i*hout->n_targets+tid];
if (idx == NULL) {
- fprintf(pysamerr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
+ fprintf(pysam_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
__func__, fn[i]);
- return -1;
+ goto fail;
}
if (mapped_tid != INT32_MIN) {
iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
}
hts_idx_destroy(idx);
- if (iter[i] == NULL) break;
+ if (iter[i] == NULL) {
+ if (mapped_tid != INT32_MIN) {
+ fprintf(pysam_stderr,
+ "[%s] failed to get iterator over "
+ "{%s, %d, %d, %d}\n",
+ __func__, fn[i], mapped_tid, beg, end);
+ } else {
+ fprintf(pysam_stderr,
+ "[%s] failed to get iterator over "
+ "{%s, HTS_IDX_NONE, 0, 0}\n",
+ __func__, fn[i]);
+ }
+ goto fail;
+ }
}
free(rtrans);
+ rtrans = NULL;
} else {
for (i = 0; i < n; ++i) {
if (hdr[i] == NULL) {
iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
- if (iter[i] == NULL) break;
+ if (iter[i] == NULL) {
+ fprintf(pysam_stderr, "[%s] failed to get iterator\n", __func__);
+ goto fail;
+ }
}
else iter[i] = NULL;
}
}
- if (i < n) {
- fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__);
- return -1;
- }
-
// Load the first read from each file into the heap
for (i = 0; i < n; ++i) {
heap1_t *h = heap + i;
+ int res;
h->i = i;
h->b = bam_init1();
- if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) {
+ if (!h->b) goto mem_fail;
+ res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b);
+ if (res >= 0) {
bam_translate(h->b, translation_tbl + i);
h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
h->idx = idx++;
}
- else {
+ else if (res == -1 && (!iter[i] || iter[i]->finished)) {
h->pos = HEAP_EMPTY;
bam_destroy1(h->b);
h->b = NULL;
+ } else {
+ fprintf(pysam_stderr, "[%s] failed to read first record from %s\n",
+ __func__, fn[i]);
+ goto fail;
}
}
// Open output file and write header
if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
- fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__);
+ fprintf(pysam_stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno));
+ return -1;
+ }
+ if (sam_hdr_write(fpout, hout) != 0) {
+ fprintf(pysam_stderr, "[%s] failed to write header.\n", __func__);
+ sam_close(fpout);
return -1;
}
- sam_hdr_write(fpout, hout);
if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads);
// Begin the actual merge
if (rg) bam_aux_del(b, rg);
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
- sam_write1(fpout, hout, b);
+ if (sam_write1(fpout, hout, b) < 0) {
+ fprintf(pysam_stderr, "[%s] failed to write to output file.\n", __func__);
+ sam_close(fpout);
+ return -1;
+ }
if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
bam_translate(b, translation_tbl + heap->i);
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
heap->idx = idx++;
- } else if (j == -1) {
+ } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) {
heap->pos = HEAP_EMPTY;
bam_destroy1(heap->b);
heap->b = NULL;
- } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+ } else {
+ fprintf(pysam_stderr, "[bam_merge_core] error: '%s' is truncated.\n",
+ fn[heap->i]);
+ goto fail;
+ }
ks_heapadjust(heap, 0, n, heap);
}
bam_hdr_destroy(hin);
bam_hdr_destroy(hout);
free_merged_header(merged_hdr);
- sam_close(fpout);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
+ if (sam_close(fpout) < 0) {
+ fprintf(pysam_stderr, "[bam_merge_core] error closing output file\n");
+ return -1;
+ }
return 0;
+
+ mem_fail:
+ fprintf(pysam_stderr, "[bam_merge_core] Out of memory\n");
+
+ fail:
+ if (flag & MERGE_RG) {
+ if (RG) {
+ for (i = 0; i != n; ++i) free(RG[i]);
+ }
+ free(RG_len);
+ }
+ for (i = 0; i < n; ++i) {
+ if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i);
+ if (iter && iter[i]) hts_itr_destroy(iter[i]);
+ if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]);
+ if (fp && fp[i]) sam_close(fp[i]);
+ if (heap && heap[i].b) bam_destroy1(heap[i].b);
+ }
+ if (hout) bam_hdr_destroy(hout);
+ free(RG);
+ free(translation_tbl);
+ free(hdr);
+ free(iter);
+ free(heap);
+ free(fp);
+ free(rtrans);
+ return -1;
}
// Unused here but may be used by legacy samtools-using third-party code
"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
"\n"
"Options:\n"
-" -n Sort by read names\n"
+" -n Input files are sorted by read name\n"
" -r Attach RG tag (inferred from file names)\n"
" -u Uncompressed BAM output\n"
" -f Overwrite the output BAM if exist\n"
};
if (argc == 1) {
- merge_usage(stdout);
+ merge_usage(pysam_stdout);
return 0;
}
fn_size += nfiles;
}
else {
- fprintf(pysamerr, "[%s] Invalid file list \"%s\"\n", __func__, optarg);
+ fprintf(pysam_stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg);
ret = 1;
}
break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
- case '?': merge_usage(pysamerr); return 1;
+ case '?': merge_usage(pysam_stderr); return 1;
}
}
if ( argc - optind < 1 ) {
- fprintf(pysamerr, "You must at least specify the output file.\n");
- merge_usage(pysamerr);
+ fprintf(pysam_stderr, "You must at least specify the output file.\n");
+ merge_usage(pysam_stderr);
return 1;
}
FILE *fp = fopen(argv[optind], "rb");
if (fp != NULL) {
fclose(fp);
- fprintf(pysamerr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
+ fprintf(pysam_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
return 1;
}
}
memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
}
if (fn_size+nargcfiles < 1) {
- fprintf(pysamerr, "You must specify at least one (and usually two or more) input files.\n");
- merge_usage(pysamerr);
+ fprintf(pysam_stderr, "You must specify at least one (and usually two or more) input files.\n");
+ merge_usage(pysam_stderr);
return 1;
}
strcpy(mode, "wb");
bam1_p *buf;
const bam_hdr_t *h;
int index;
+ int error;
} worker_t;
-static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
+// Returns 0 for success
+// -1 for failure
+static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt)
{
size_t i;
samFile* fp;
fp = sam_open_format(fn, mode, fmt);
- if (fp == NULL) return;
- sam_hdr_write(fp, h);
+ if (fp == NULL) return -1;
+ if (sam_hdr_write(fp, h) != 0) goto fail;
if (n_threads > 1) hts_set_threads(fp, n_threads);
- for (i = 0; i < l; ++i)
- sam_write1(fp, h, buf[i]);
+ for (i = 0; i < l; ++i) {
+ if (sam_write1(fp, h, buf[i]) < 0) goto fail;
+ }
+ if (sam_close(fp) < 0) return -1;
+ return 0;
+ fail:
sam_close(fp);
+ return -1;
}
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
char *name;
+ w->error = 0;
ks_mergesort(sort, w->buf_len, w->buf, 0);
name = (char*)calloc(strlen(w->prefix) + 20, 1);
+ if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0, NULL);
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
+ w->error = errno;
// Consider using CRAM temporary files if the final output is CRAM.
// Typically it is comparable speed while being smaller.
// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
// };
// opt[0].next = &opt[1];
-// write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt);
+// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0)
+// w->error = errno;
free(name);
return 0;
pthread_t *tid;
pthread_attr_t attr;
worker_t *w;
+ int n_failed = 0;
if (n_threads < 1) n_threads = 1;
if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records
b += w[i].buf_len; rest -= w[i].buf_len;
pthread_create(&tid[i], &attr, worker, &w[i]);
}
- for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+ for (i = 0; i < n_threads; ++i) {
+ pthread_join(tid[i], 0);
+ if (w[i].error != 0) {
+ fprintf(pysam_stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error));
+ n_failed++;
+ }
+ }
free(tid); free(w);
- return n_files + n_threads;
+ return (n_failed == 0)? n_files + n_threads : -1;
}
/*!
fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
const char *message = strerror(errno);
- fprintf(pysamerr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
+ fprintf(pysam_stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
return -2;
}
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(pysamerr, "[bam_sort_core] failed to read header for '%s'\n", fn);
+ fprintf(pysam_stderr, "[bam_sort_core] failed to read header for '%s'\n", fn);
goto err;
}
if (is_by_qname) change_SO(header, "queryname");
++k;
if (mem >= max_mem) {
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+ if (n_files < 0) {
+ ret = -1;
+ goto err;
+ }
mem = k = 0;
}
}
if (ret != -1) {
- fprintf(pysamerr, "[bam_sort_core] truncated file. Aborting.\n");
+ fprintf(pysam_stderr, "[bam_sort_core] truncated file. Aborting.\n");
ret = -1;
goto err;
}
// write the final output
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
- write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt);
+ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
+ fprintf(pysam_stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno));
+ ret = -1;
+ goto err;
+ }
} else { // then merge
char **fns;
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
- fprintf(pysamerr, "[bam_sort_core] merging from %d files...\n", n_files);
+ if (n_files == -1) {
+ ret = -1;
+ goto err;
+ }
+ fprintf(pysam_stderr, "[bam_sort_core] merging from %d files...\n", n_files);
fns = (char**)calloc(n_files, sizeof(char*));
for (i = 0; i < n_files; ++i) {
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
- MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads,
- in_fmt, out_fmt) < 0) {
+ MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
+ NULL, n_threads, in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
+ struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
- case '?': sort_usage(pysamerr); ret = EXIT_FAILURE; goto sort_end;
+ case '?': sort_usage(pysam_stderr); ret = EXIT_FAILURE; goto sort_end;
}
}
nargs = argc - optind;
if (nargs == 0 && isatty(STDIN_FILENO)) {
- sort_usage(stdout);
+ sort_usage(pysam_stdout);
ret = EXIT_SUCCESS;
goto sort_end;
}
else if (nargs >= 2) {
// If exactly two, user probably tried to specify legacy <out.prefix>
if (nargs == 2)
- fprintf(pysamerr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n");
+ fprintf(pysam_stderr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n");
- sort_usage(pysamerr);
+ sort_usage(pysam_stderr);
ret = EXIT_FAILURE;
goto sort_end;
}
sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
- if (tmpprefix.l == 0)
- ksprintf(&tmpprefix, "%s.tmp", (nargs > 0)? argv[optind] : "STDIN");
+ if (tmpprefix.l == 0) {
+ if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout);
+ else kputc('.', &tmpprefix);
+ }
+ if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) {
+ unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
+ if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix);
+ ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
+ }
ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
tmpprefix.s, fnout, modeout, max_mem, n_threads,
// If we failed on opening the input file & it has no .bam/.cram/etc
// extension, the user probably tried legacy -o <infile> <out.prefix>
if (ret == -2 && o_seen && nargs > 0 && sam_open_mode(dummy, argv[optind], NULL) < 0)
- fprintf(pysamerr, "[bam_sort] Note the <out.prefix> argument has been replaced by -T/-o options\n");
+ fprintf(pysam_stderr, "[bam_sort] Note the <out.prefix> argument has been replaced by -T/-o options\n");
ret = EXIT_FAILURE;
}
/* bam_split.c -- split subcommand.
- Copyright (C) 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2013-2015 Genome Research Ltd.
Author: Martin Pollard <mp15@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/sam.h>
#include <string.h>
#include <stdio.h>
bam_hdr_t* unaccounted_header;
size_t output_count;
char** rg_id;
+ char **rg_output_file_name;
samFile** rg_output_file;
bam_hdr_t** rg_output_header;
kh_c2i_t* rg_hash;
typedef struct state state_t;
-static int cleanup_state(state_t* status);
+static int cleanup_state(state_t* status, bool check_close);
static void cleanup_opts(parsed_opts_t* opts);
static void usage(FILE *write_to)
if (retval->merged_input_header == NULL) {
fprintf(stderr, "Could not read header for file '%s'\n",
opts->merged_input_name);
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
fprintf(stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
if (retval->unaccounted_header == NULL) {
fprintf(stderr, "Could not read header for file '%s'\n",
opts->unaccounted_header_name);
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
sam_close(hdr_load);
retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
fprintf(stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
}
if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL;
if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count);
+ retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *));
retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*));
retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*));
retval->rg_hash = kh_init_c2i();
- if (!retval->rg_output_file || !retval->rg_output_header) {
+ if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) {
fprintf(stderr, "Could not allocate memory for output file array. Out of memory?");
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
if (!input_base_name) {
fprintf(stderr, "Out of memory\n");
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
char* extension = strrchr(input_base_name, '.');
&opts->ga.out);
if ( output_filename == NULL ) {
- fprintf(stderr, "Error expanding output filename format string.\r\n");
- cleanup_state(retval);
+ fprintf(stderr, "Error expanding output filename format string.\n");
+ cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
+ retval->rg_output_file_name[i] = output_filename;
retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
- fprintf(stderr, "Could not open output file: %s\r\n", output_filename);
- cleanup_state(retval);
+ fprintf(stderr, "Could not open output file: %s\n", output_filename);
+ cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
// Set and edit header
retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header);
if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) {
- fprintf(stderr, "Could not rewrite header for file: %s\r\n", output_filename);
- cleanup_state(retval);
- free(output_filename);
+ fprintf(stderr, "Could not rewrite header for file: %s\n", output_filename);
+ cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
- free(output_filename);
}
free(input_base_name);
size_t i;
for (i = 0; i < state->output_count; i++) {
if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
- fprintf(stderr, "Could not write output file header\n");
+ fprintf(stderr, "Could not write output file header for '%s'\n",
+ state->rg_output_file_name[i]);
return false;
}
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(stderr, "Could not write read sequence\n");
+ fprintf(stderr, "Could not read first input record\n");
return false;
}
}
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
- fprintf(stderr, "Could not write sequence\n");
+ fprintf(stderr, "Could not write to output file '%s'\n",
+ state->rg_output_file_name[i]);
+ bam_destroy1(file_read);
return false;
}
} else {
return false;
} else {
if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
- fprintf(stderr, "Could not write sequence\n");
+ fprintf(stderr, "Could not write to unaccounted output file\n");
+ bam_destroy1(file_read);
return false;
}
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(stderr, "Could not write read sequence\n");
+ fprintf(stderr, "Could not read input record\n");
return false;
}
}
return true;
}
-static int cleanup_state(state_t* status)
+static int cleanup_state(state_t* status, bool check_close)
{
int ret = 0;
if (!status) return 0;
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
- if (status->unaccounted_file) ret |= sam_close(status->unaccounted_file);
+ if (status->unaccounted_file) {
+ if (sam_close(status->unaccounted_file) < 0 && check_close) {
+ fprintf(stderr, "Error on closing unaccounted file\n");
+ ret = -1;
+ }
+ }
sam_close(status->merged_input_file);
size_t i;
for (i = 0; i < status->output_count; i++) {
- bam_hdr_destroy(status->rg_output_header[i]);
- ret |= sam_close(status->rg_output_file[i]);
- free(status->rg_id[i]);
+ if (status->rg_output_header && status->rg_output_header[i])
+ bam_hdr_destroy(status->rg_output_header[i]);
+ if (status->rg_output_file && status->rg_output_file[i]) {
+ if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
+ fprintf(stderr, "Error on closing output file '%s'\n",
+ status->rg_output_file_name[i]);
+ ret = -1;
+ }
+ }
+ if (status->rg_id) free(status->rg_id[i]);
+ if (status->rg_output_file_name) free(status->rg_output_file_name[i]);
}
- bam_hdr_destroy(status->merged_input_header);
+ if (status->merged_input_header)
+ bam_hdr_destroy(status->merged_input_header);
free(status->rg_output_header);
free(status->rg_output_file);
+ free(status->rg_output_file_name);
kh_destroy_c2i(status->rg_hash);
free(status->rg_id);
free(status);
{
int ret = 1;
parsed_opts_t* opts = parse_args(argc, argv);
- if (!opts ) goto cleanup_opts;
+ if (!opts) goto cleanup_opts;
state_t* status = init(opts);
if (!status) goto cleanup_opts;
- if (split(status)) ret = 0;
+ if (!split(status)) {
+ cleanup_state(status, false);
+ goto cleanup_opts;
+ }
+
+ ret = cleanup_state(status, true);
- ret |= (cleanup_state(status) != 0);
cleanup_opts:
cleanup_opts(opts);
/* bam_split.c -- split subcommand.
- Copyright (C) 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2013-2015 Genome Research Ltd.
Author: Martin Pollard <mp15@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <htslib/sam.h>
#include <string.h>
#include <stdio.h>
bam_hdr_t* unaccounted_header;
size_t output_count;
char** rg_id;
+ char **rg_output_file_name;
samFile** rg_output_file;
bam_hdr_t** rg_output_header;
kh_c2i_t* rg_hash;
typedef struct state state_t;
-static int cleanup_state(state_t* status);
+static int cleanup_state(state_t* status, bool check_close);
static void cleanup_opts(parsed_opts_t* opts);
static void usage(FILE *write_to)
// Takes the command line options and turns them into something we can understand
static parsed_opts_t* parse_args(int argc, char** argv)
{
- if (argc == 1) { usage(stdout); return NULL; }
+ if (argc == 1) { usage(pysam_stdout); return NULL; }
const char* optstring = "vf:u:";
char* delim;
if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break;
/* else fall-through */
case '?':
- usage(stdout);
+ usage(pysam_stdout);
free(retval);
return NULL;
}
argv += optind;
if (argc != 1) {
- fprintf(pysamerr, "Invalid number of arguments: %d\n", argc);
- usage(pysamerr);
+ fprintf(pysam_stderr, "Invalid number of arguments: %d\n", argc);
+ usage(pysam_stderr);
free(retval);
return NULL;
}
kputs("bam", &str);
break;
case '\0':
- // Error is: fprintf(pysamerr, "bad format string, trailing %%\n");
+ // Error is: fprintf(pysam_stderr, "bad format string, trailing %%\n");
free(str.s);
return NULL;
default:
- // Error is: fprintf(pysamerr, "bad format string, unknown format specifier\n");
+ // Error is: fprintf(pysam_stderr, "bad format string, unknown format specifier\n");
free(str.s);
return NULL;
}
{
state_t* retval = calloc(sizeof(state_t), 1);
if (!retval) {
- fprintf(pysamerr, "Out of memory");
+ fprintf(pysam_stderr, "Out of memory");
return NULL;
}
retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
if (!retval->merged_input_file) {
- fprintf(pysamerr, "Could not open input file (%s)\n", opts->merged_input_name);
+ fprintf(pysam_stderr, "Could not open input file (%s)\n", opts->merged_input_name);
free(retval);
return NULL;
}
retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
if (retval->merged_input_header == NULL) {
- fprintf(pysamerr, "Could not read header for file '%s'\n",
+ fprintf(pysam_stderr, "Could not read header for file '%s'\n",
opts->merged_input_name);
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
if (opts->unaccounted_header_name) {
samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
- fprintf(pysamerr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
- cleanup_state(retval);
+ fprintf(pysam_stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
+ cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
if (retval->unaccounted_header == NULL) {
- fprintf(pysamerr, "Could not read header for file '%s'\n",
+ fprintf(pysam_stderr, "Could not read header for file '%s'\n",
opts->unaccounted_header_name);
- cleanup_state(retval);
+ cleanup_state(retval, false);
return NULL;
}
sam_close(hdr_load);
retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
- fprintf(pysamerr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
- cleanup_state(retval);
+ fprintf(pysam_stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
+ cleanup_state(retval, false);
return NULL;
}
}
// Open output files for RGs
if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL;
- if (opts->verbose) fprintf(pysamerr, "@RG's found %zu\n",retval->output_count);
+ if (opts->verbose) fprintf(pysam_stderr, "@RG's found %zu\n",retval->output_count);
+ retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *));
retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*));
retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*));
retval->rg_hash = kh_init_c2i();
- if (!retval->rg_output_file || !retval->rg_output_header) {
- fprintf(pysamerr, "Could not allocate memory for output file array. Out of memory?");
- cleanup_state(retval);
+ if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) {
+ fprintf(pysam_stderr, "Could not allocate memory for output file array. Out of memory?");
+ cleanup_state(retval, false);
return NULL;
}
char* dirsep = strrchr(opts->merged_input_name, '/');
char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
if (!input_base_name) {
- fprintf(pysamerr, "Out of memory\n");
- cleanup_state(retval);
+ fprintf(pysam_stderr, "Out of memory\n");
+ cleanup_state(retval, false);
return NULL;
}
char* extension = strrchr(input_base_name, '.');
&opts->ga.out);
if ( output_filename == NULL ) {
- fprintf(pysamerr, "Error expanding output filename format string.\r\n");
- cleanup_state(retval);
+ fprintf(pysam_stderr, "Error expanding output filename format string.\n");
+ cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
+ retval->rg_output_file_name[i] = output_filename;
retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
- fprintf(pysamerr, "Could not open output file: %s\r\n", output_filename);
- cleanup_state(retval);
+ fprintf(pysam_stderr, "Could not open output file: %s\n", output_filename);
+ cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
// Set and edit header
retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header);
if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) {
- fprintf(pysamerr, "Could not rewrite header for file: %s\r\n", output_filename);
- cleanup_state(retval);
- free(output_filename);
+ fprintf(pysam_stderr, "Could not rewrite header for file: %s\n", output_filename);
+ cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
- free(output_filename);
}
free(input_base_name);
static bool split(state_t* state)
{
if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
- fprintf(pysamerr, "Could not write output file header\n");
+ fprintf(pysam_stderr, "Could not write output file header\n");
return false;
}
size_t i;
for (i = 0; i < state->output_count; i++) {
if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
- fprintf(pysamerr, "Could not write output file header\n");
+ fprintf(pysam_stderr, "Could not write output file header for '%s'\n",
+ state->rg_output_file_name[i]);
return false;
}
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(pysamerr, "Could not write read sequence\n");
+ fprintf(pysam_stderr, "Could not read first input record\n");
return false;
}
}
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
- fprintf(pysamerr, "Could not write sequence\n");
+ fprintf(pysam_stderr, "Could not write to output file '%s'\n",
+ state->rg_output_file_name[i]);
+ bam_destroy1(file_read);
return false;
}
} else {
// otherwise write to the unaccounted bam if there is one or fail
if (state->unaccounted_file == NULL) {
if (tag) {
- fprintf(pysamerr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag));
+ fprintf(pysam_stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag));
} else {
- fprintf(pysamerr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
+ fprintf(pysam_stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
}
bam_destroy1(file_read);
return false;
} else {
if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
- fprintf(pysamerr, "Could not write sequence\n");
+ fprintf(pysam_stderr, "Could not write to unaccounted output file\n");
+ bam_destroy1(file_read);
return false;
}
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(pysamerr, "Could not write read sequence\n");
+ fprintf(pysam_stderr, "Could not read input record\n");
return false;
}
}
return true;
}
-static int cleanup_state(state_t* status)
+static int cleanup_state(state_t* status, bool check_close)
{
int ret = 0;
if (!status) return 0;
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
- if (status->unaccounted_file) ret |= sam_close(status->unaccounted_file);
+ if (status->unaccounted_file) {
+ if (sam_close(status->unaccounted_file) < 0 && check_close) {
+ fprintf(pysam_stderr, "Error on closing unaccounted file\n");
+ ret = -1;
+ }
+ }
sam_close(status->merged_input_file);
size_t i;
for (i = 0; i < status->output_count; i++) {
- bam_hdr_destroy(status->rg_output_header[i]);
- ret |= sam_close(status->rg_output_file[i]);
- free(status->rg_id[i]);
+ if (status->rg_output_header && status->rg_output_header[i])
+ bam_hdr_destroy(status->rg_output_header[i]);
+ if (status->rg_output_file && status->rg_output_file[i]) {
+ if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
+ fprintf(pysam_stderr, "Error on closing output file '%s'\n",
+ status->rg_output_file_name[i]);
+ ret = -1;
+ }
+ }
+ if (status->rg_id) free(status->rg_id[i]);
+ if (status->rg_output_file_name) free(status->rg_output_file_name[i]);
}
- bam_hdr_destroy(status->merged_input_header);
+ if (status->merged_input_header)
+ bam_hdr_destroy(status->merged_input_header);
free(status->rg_output_header);
free(status->rg_output_file);
+ free(status->rg_output_file_name);
kh_destroy_c2i(status->rg_hash);
free(status->rg_id);
free(status);
{
int ret = 1;
parsed_opts_t* opts = parse_args(argc, argv);
- if (!opts ) goto cleanup_opts;
+ if (!opts) goto cleanup_opts;
state_t* status = init(opts);
if (!status) goto cleanup_opts;
- if (split(status)) ret = 0;
+ if (!split(status)) {
+ cleanup_state(status, false);
+ goto cleanup_opts;
+ }
+
+ ret = cleanup_state(status, true);
- ret |= (cleanup_state(status) != 0);
cleanup_opts:
cleanup_opts(opts);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
flagstat_loop(s, c);
bam_destroy1(b);
if (ret != -1)
- fprintf(pysamerr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+ fprintf(pysam_stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
return s;
}
switch (c) {
case INPUT_FMT_OPTION:
if (hts_opt_add(&in_opts, optarg) < 0)
- usage_exit(pysamerr, EXIT_FAILURE);
+ usage_exit(pysam_stderr, EXIT_FAILURE);
break;
default:
- usage_exit(pysamerr, EXIT_FAILURE);
+ usage_exit(pysam_stderr, EXIT_FAILURE);
}
}
if (argc != optind+1) {
- if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
- else usage_exit(pysamerr, EXIT_FAILURE);
+ if (argc == optind) usage_exit(pysam_stdout, EXIT_SUCCESS);
+ else usage_exit(pysam_stderr, EXIT_FAILURE);
}
fp = sam_open(argv[optind], "r");
if (fp == NULL) {
return 1;
}
if (hts_opt_apply(fp, in_opts)) {
- fprintf(pysamerr, "Failed to apply input-fmt-options\n");
+ fprintf(pysam_stderr, "Failed to apply input-fmt-options\n");
return 1;
}
if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) {
- fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
return 1;
}
if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) {
- fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
return 1;
}
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(pysamerr, "Failed to read header for \"%s\"\n", argv[optind]);
+ fprintf(pysam_stderr, "Failed to read header for \"%s\"\n", argv[optind]);
return 1;
}
s = bam_flagstat_core(fp, header);
- printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
- printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
- printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
- printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
- printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
- printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
- printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
- printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
- printf("%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1]));
- printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
- printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1]));
- printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
- printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
+ fprintf(pysam_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+ fprintf(pysam_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
+ fprintf(pysam_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
+ fprintf(pysam_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
+ fprintf(pysam_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+ fprintf(pysam_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
+ fprintf(pysam_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
+ fprintf(pysam_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
+ fprintf(pysam_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1]));
+ fprintf(pysam_stdout, "%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
+ fprintf(pysam_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1]));
+ fprintf(pysam_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
+ fprintf(pysam_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
free(s);
bam_hdr_destroy(header);
sam_close(fp);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <regex.h>
#include <assert.h>
#include "bam_tview.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <regex.h>
#include <assert.h>
#include "bam_tview.h"
tv->fp = sam_open_format(fn, "r", fmt);
if(tv->fp == NULL)
{
- fprintf(pysamerr,"sam_open %s. %s\n", fn,fn_fa);
+ fprintf(pysam_stderr,"sam_open %s. %s\n", fn,fn_fa);
exit(EXIT_FAILURE);
}
// TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024);
tv->header = sam_hdr_read(tv->fp);
if(tv->header == NULL)
{
- fprintf(pysamerr,"Cannot read '%s'.\n", fn);
+ fprintf(pysam_stderr,"Cannot read '%s'.\n", fn);
exit(EXIT_FAILURE);
}
tv->idx = sam_index_load(tv->fp, fn);
if (tv->idx == NULL)
{
- fprintf(pysamerr,"Cannot read index for '%s'.\n", fn);
+ fprintf(pysam_stderr,"Cannot read index for '%s'.\n", fn);
exit(EXIT_FAILURE);
}
tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
free(str);
if ( !tv->ref )
{
- fprintf(pysamerr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
+ fprintf(pysam_stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
exit(1);
}
}
{
if ( !format )
{
- fprintf(pysamerr,
+ fprintf(pysam_stderr,
"Usage: samtools tview [options] <aln.bam> [ref.fasta]\n"
"Options:\n"
" -d display output as (H)tml or (C)urses or (T)ext \n"
" -p chr:pos go directly to this position\n"
" -s STR display only reads from this sample or group\n");
- sam_global_opt_help(pysamerr, "-.--.");
+ sam_global_opt_help(pysam_stderr, "-.--.");
}
else
{
va_list ap;
va_start(ap, format);
- vfprintf(pysamerr, format, ap);
+ vfprintf(pysam_stderr, format, ap);
va_end(ap);
}
exit(-1);
}
if ( i==tv->header->n_targets )
{
- fprintf(pysamerr,"None of the BAM sequence names present in the fasta file\n");
+ fprintf(pysam_stderr,"None of the BAM sequence names present in the fasta file\n");
exit(EXIT_FAILURE);
}
tv->curr_tid = i;
tview_t* base=(tview_t*)tv;
if(tv==0)
{
- fprintf(pysamerr,"Calloc failed\n");
+ fprintf(pysam_stderr,"Calloc failed\n");
return 0;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include "bam_tview.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include "bam_tview.h"
fprintf(ptr->out,"<span");
while(css<32)
{
- //if(y>1) fprintf(pysamerr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
+ //if(y>1) fprintf(pysam_stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
{
tview_t* base=(tview_t*)tv;
if(tv==0)
{
- fprintf(pysamerr,"Calloc failed\n");
+ fprintf(pysam_stderr,"Calloc failed\n");
return 0;
}
tv->row_count=0;
tv->screen=NULL;
- tv->out=stdout;
+ tv->out=pysam_stdout;
tv->attributes=0;
base_tv_init(base,fn,fn_fa,samples,fmt);
/* initialize callbacks */
/* bamshuf.c -- collate subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013, 2015 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
int is_stdout, sam_global_args *ga)
{
- samFile *fp, *fpw, **fpt;
- char **fnt, modew[8];
- bam1_t *b;
- int i, l;
- bam_hdr_t *h;
- int64_t *cnt;
+ samFile *fp, *fpw = NULL, **fpt = NULL;
+ char **fnt = NULL, modew[8];
+ bam1_t *b = NULL;
+ int i, l, r;
+ bam_hdr_t *h = NULL;
+ int64_t j, max_cnt = 0, *cnt = NULL;
+ elem_t *a = NULL;
- // split
+ // Read input, distribute reads pseudo-randomly into n_files temporary
+ // files.
fp = sam_open_format(fn, "r", &ga->in);
if (fp == NULL) {
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
h = sam_hdr_read(fp);
if (h == NULL) {
fprintf(stderr, "Couldn't read header for '%s'\n", fn);
- return 1;
+ goto fail;
}
fnt = (char**)calloc(n_files, sizeof(char*));
+ if (!fnt) goto mem_fail;
fpt = (samFile**)calloc(n_files, sizeof(samFile*));
+ if (!fpt) goto mem_fail;
cnt = (int64_t*)calloc(n_files, 8);
+ if (!cnt) goto mem_fail;
+
l = strlen(pre);
for (i = 0; i < n_files; ++i) {
fnt[i] = (char*)calloc(l + 10, 1);
+ if (!fnt[i]) goto mem_fail;
sprintf(fnt[i], "%s.%.4d.bam", pre, i);
fpt[i] = sam_open(fnt[i], "wb1");
if (fpt[i] == NULL) {
print_error_errno("collate", "Cannot open intermediate file \"%s\"", fnt[i]);
- return 1;
+ goto fail;
+ }
+ if (sam_hdr_write(fpt[i], h) < 0) {
+ print_error_errno("collate", "Couldn't write header to intermediate file \"%s\"", fnt[i]);
+ goto fail;
}
- sam_hdr_write(fpt[i], h);
}
b = bam_init1();
- while (sam_read1(fp, h, b) >= 0) {
+ if (!b) goto mem_fail;
+ while ((r = sam_read1(fp, h, b)) >= 0) {
uint32_t x;
x = hash_X31_Wang(bam_get_qname(b)) % n_files;
- sam_write1(fpt[x], h, b);
+ if (sam_write1(fpt[x], h, b) < 0) {
+ print_error_errno("collate", "Couldn't write to intermediate file \"%s\"", fnt[x]);
+ goto fail;
+ }
++cnt[x];
}
bam_destroy1(b);
- for (i = 0; i < n_files; ++i) sam_close(fpt[i]);
+ b = NULL;
+ if (r < -1) {
+ fprintf(stderr, "Error reading input file\n");
+ goto fail;
+ }
+ for (i = 0; i < n_files; ++i) {
+ // Close split output
+ r = sam_close(fpt[i]);
+ fpt[i] = NULL;
+ if (r < 0) {
+ fprintf(stderr, "Error on closing '%s'\n", fnt[i]);
+ return 1;
+ }
+
+ // Find biggest count
+ if (max_cnt < cnt[i]) max_cnt = cnt[i];
+ }
free(fpt);
+ fpt = NULL;
sam_close(fp);
-
+ fp = NULL;
// merge
sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
if (!is_stdout) { // output to a file
char *fnw = (char*)calloc(l + 5, 1);
+ if (!fnw) goto mem_fail;
if (ga->out.format == unknown_format)
sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default
else
if (fpw == NULL) {
if (is_stdout) print_error_errno("collate", "Cannot open standard output");
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
- return 1;
+ goto fail;
+ }
+
+ if (sam_hdr_write(fpw, h) < 0) {
+ print_error_errno("collate", "Couldn't write header");
+ goto fail;
+ }
+
+ a = malloc(max_cnt * sizeof(elem_t));
+ if (!a) goto mem_fail;
+ for (j = 0; j < max_cnt; ++j) {
+ a[j].b = bam_init1();
+ if (!a[j].b) { max_cnt = j; goto mem_fail; }
}
- sam_hdr_write(fpw, h);
for (i = 0; i < n_files; ++i) {
- int64_t j, c = cnt[i];
- elem_t *a;
+ int64_t c = cnt[i];
fp = sam_open_format(fnt[i], "r", &ga->in);
- bam_hdr_destroy(sam_hdr_read(fp));
- a = (elem_t*)calloc(c, sizeof(elem_t));
+ if (NULL == fp) {
+ print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]);
+ goto fail;
+ }
+ bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header
+
+ // Slurp in one of the split files
for (j = 0; j < c; ++j) {
- a[j].b = bam_init1();
- sam_read1(fp, h, a[j].b);
+ if (sam_read1(fp, h, a[j].b) < 0) {
+ fprintf(stderr, "Error reading '%s'\n", fnt[i]);
+ goto fail;
+ }
a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
}
sam_close(fp);
+ fp = NULL; // already closed: keeps the fail path from closing it again if a later write fails
unlink(fnt[i]);
free(fnt[i]);
- ks_introsort(bamshuf, c, a);
+ fnt[i] = NULL;
+
+ ks_introsort(bamshuf, c, a); // Shuffle all the reads
+
+ // Write them out again
for (j = 0; j < c; ++j) {
- sam_write1(fpw, h, a[j].b);
- bam_destroy1(a[j].b);
+ if (sam_write1(fpw, h, a[j].b) < 0) {
+ print_error_errno("collate", "Error writing to output");
+ goto fail;
+ }
}
- free(a);
}
- sam_close(fpw);
+
bam_hdr_destroy(h);
- free(fnt); free(cnt);
+ for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b);
+ free(a); free(fnt); free(cnt);
sam_global_args_free(ga);
+ if (sam_close(fpw) < 0) {
+ fprintf(stderr, "Error on closing output\n");
+ return 1;
+ }
return 0;
+
+ mem_fail:
+ fprintf(stderr, "Out of memory\n");
+
+ fail:
+ if (fp) sam_close(fp);
+ if (fpw) sam_close(fpw);
+ if (h) bam_hdr_destroy(h);
+ if (b) bam_destroy1(b);
+ for (i = 0; i < n_files; ++i) {
+ if (fnt) free(fnt[i]);
+ if (fpt && fpt[i]) sam_close(fpt[i]);
+ }
+ if (a) {
+ for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b);
+ free(a);
+ }
+ free(fnt);
+ free(fpt);
+ free(cnt);
+ sam_global_args_free(ga);
+ return 1;
}
static int usage(FILE *fp, int n_files) {
/* bamshuf.c -- collate subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013, 2015 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
KSORT_INIT(bamshuf, elem_t, elem_lt)
static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
- int is_stdout, sam_global_args *ga)
+ int is_pysam_stdout, sam_global_args *ga)
{
- samFile *fp, *fpw, **fpt;
- char **fnt, modew[8];
- bam1_t *b;
- int i, l;
- bam_hdr_t *h;
- int64_t *cnt;
+ samFile *fp, *fpw = NULL, **fpt = NULL;
+ char **fnt = NULL, modew[8];
+ bam1_t *b = NULL;
+ int i, l, r;
+ bam_hdr_t *h = NULL;
+ int64_t j, max_cnt = 0, *cnt = NULL;
+ elem_t *a = NULL;
- // split
+ // Read input, distribute reads pseudo-randomly into n_files temporary
+ // files.
fp = sam_open_format(fn, "r", &ga->in);
if (fp == NULL) {
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
h = sam_hdr_read(fp);
if (h == NULL) {
- fprintf(pysamerr, "Couldn't read header for '%s'\n", fn);
- return 1;
+ fprintf(pysam_stderr, "Couldn't read header for '%s'\n", fn);
+ goto fail;
}
fnt = (char**)calloc(n_files, sizeof(char*));
+ if (!fnt) goto mem_fail;
fpt = (samFile**)calloc(n_files, sizeof(samFile*));
+ if (!fpt) goto mem_fail;
cnt = (int64_t*)calloc(n_files, 8);
+ if (!cnt) goto mem_fail;
+
l = strlen(pre);
for (i = 0; i < n_files; ++i) {
fnt[i] = (char*)calloc(l + 10, 1);
+ if (!fnt[i]) goto mem_fail;
sprintf(fnt[i], "%s.%.4d.bam", pre, i);
fpt[i] = sam_open(fnt[i], "wb1");
if (fpt[i] == NULL) {
print_error_errno("collate", "Cannot open intermediate file \"%s\"", fnt[i]);
- return 1;
+ goto fail;
+ }
+ if (sam_hdr_write(fpt[i], h) < 0) {
+ print_error_errno("collate", "Couldn't write header to intermediate file \"%s\"", fnt[i]);
+ goto fail;
}
- sam_hdr_write(fpt[i], h);
}
b = bam_init1();
- while (sam_read1(fp, h, b) >= 0) {
+ if (!b) goto mem_fail;
+ while ((r = sam_read1(fp, h, b)) >= 0) {
uint32_t x;
x = hash_X31_Wang(bam_get_qname(b)) % n_files;
- sam_write1(fpt[x], h, b);
+ if (sam_write1(fpt[x], h, b) < 0) {
+ print_error_errno("collate", "Couldn't write to intermediate file \"%s\"", fnt[x]);
+ goto fail;
+ }
++cnt[x];
}
bam_destroy1(b);
- for (i = 0; i < n_files; ++i) sam_close(fpt[i]);
+ b = NULL;
+ if (r < -1) {
+ fprintf(pysam_stderr, "Error reading input file\n");
+ goto fail;
+ }
+ for (i = 0; i < n_files; ++i) {
+ // Close split output
+ r = sam_close(fpt[i]);
+ fpt[i] = NULL;
+ if (r < 0) {
+ fprintf(pysam_stderr, "Error on closing '%s'\n", fnt[i]);
+ // Leave via the shared cleanup: fp, h, the still-open fpt[] handles and the fnt/fpt/cnt arrays must be released.
+ goto fail;
+ }
+
+ // Find biggest count
+ if (max_cnt < cnt[i]) max_cnt = cnt[i];
+ }
free(fpt);
+ fpt = NULL;
sam_close(fp);
-
+ fp = NULL;
// merge
sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
- if (!is_stdout) { // output to a file
+ if (!is_pysam_stdout) { // output to a file
char *fnw = (char*)calloc(l + 5, 1);
+ if (!fnw) goto mem_fail;
if (ga->out.format == unknown_format)
sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default
else
sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out));
fpw = sam_open_format(fnw, modew, &ga->out);
free(fnw);
- } else fpw = sam_open_format("-", modew, &ga->out); // output to stdout
+ } else fpw = sam_open_format("-", modew, &ga->out); // output to pysam_stdout
if (fpw == NULL) {
- if (is_stdout) print_error_errno("collate", "Cannot open standard output");
+ if (is_pysam_stdout) print_error_errno("collate", "Cannot open standard output");
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
- return 1;
+ goto fail;
+ }
+
+ if (sam_hdr_write(fpw, h) < 0) {
+ print_error_errno("collate", "Couldn't write header");
+ goto fail;
+ }
+
+ a = malloc(max_cnt * sizeof(elem_t));
+ if (!a) goto mem_fail;
+ for (j = 0; j < max_cnt; ++j) {
+ a[j].b = bam_init1();
+ if (!a[j].b) { max_cnt = j; goto mem_fail; }
}
- sam_hdr_write(fpw, h);
for (i = 0; i < n_files; ++i) {
- int64_t j, c = cnt[i];
- elem_t *a;
+ int64_t c = cnt[i];
fp = sam_open_format(fnt[i], "r", &ga->in);
- bam_hdr_destroy(sam_hdr_read(fp));
- a = (elem_t*)calloc(c, sizeof(elem_t));
+ if (NULL == fp) {
+ print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]);
+ goto fail;
+ }
+ bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header
+
+ // Slurp in one of the split files
for (j = 0; j < c; ++j) {
- a[j].b = bam_init1();
- sam_read1(fp, h, a[j].b);
+ if (sam_read1(fp, h, a[j].b) < 0) {
+ fprintf(pysam_stderr, "Error reading '%s'\n", fnt[i]);
+ goto fail;
+ }
a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
}
sam_close(fp);
+ fp = NULL; // already closed: keeps the fail path from closing it again if a later write fails
unlink(fnt[i]);
free(fnt[i]);
- ks_introsort(bamshuf, c, a);
+ fnt[i] = NULL;
+
+ ks_introsort(bamshuf, c, a); // Shuffle all the reads
+
+ // Write them out again
for (j = 0; j < c; ++j) {
- sam_write1(fpw, h, a[j].b);
- bam_destroy1(a[j].b);
+ if (sam_write1(fpw, h, a[j].b) < 0) {
+ print_error_errno("collate", "Error writing to output");
+ goto fail;
+ }
}
- free(a);
}
- sam_close(fpw);
+
bam_hdr_destroy(h);
- free(fnt); free(cnt);
+ for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b);
+ free(a); free(fnt); free(cnt);
sam_global_args_free(ga);
+ if (sam_close(fpw) < 0) {
+ fprintf(pysam_stderr, "Error on closing output\n");
+ return 1;
+ }
return 0;
+
+ mem_fail:
+ fprintf(pysam_stderr, "Out of memory\n");
+
+ fail:
+ if (fp) sam_close(fp);
+ if (fpw) sam_close(fpw);
+ if (h) bam_hdr_destroy(h);
+ if (b) bam_destroy1(b);
+ for (i = 0; i < n_files; ++i) {
+ if (fnt) free(fnt[i]);
+ if (fpt && fpt[i]) sam_close(fpt[i]);
+ }
+ if (a) {
+ for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b);
+ free(a);
+ }
+ free(fnt);
+ free(fpt);
+ free(cnt);
+ sam_global_args_free(ga);
+ return 1;
}
static int usage(FILE *fp, int n_files) {
fprintf(fp,
"Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
"Options:\n"
- " -O output to stdout\n"
+ " -O output to pysam_stdout\n"
" -u uncompressed BAM output\n"
" -l INT compression level [%d]\n" // DEF_CLEVEL
" -n INT number of temporary files [%d]\n", // n_files
int main_bamshuf(int argc, char *argv[])
{
- int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
+ int c, n_files = 64, clevel = DEF_CLEVEL, is_pysam_stdout = 0, is_un = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'u': is_un = 1; break;
- case 'O': is_stdout = 1; break;
+ case 'O': is_pysam_stdout = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
- case '?': return usage(pysamerr, n_files);
+ case '?': return usage(pysam_stderr, n_files);
}
}
if (is_un) clevel = 0;
if (optind + 2 > argc)
- return usage(pysamerr, n_files);
+ return usage(pysam_stderr, n_files);
- return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout, &ga);
+ return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_pysam_stdout, &ga);
}
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2015 Genome Research Ltd.
+ Copyright (C) 2008-2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
printf(
"samtools %s\n"
"Using htslib %s\n"
-"Copyright (C) 2015 Genome Research Ltd.\n",
+"Copyright (C) 2016 Genome Research Ltd.\n",
samtools_version(), hts_version());
}
else if (strcmp(argv[1], "--version-only") == 0) {
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2015 Genome Research Ltd.
+ Copyright (C) 2008-2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
{
- fflush(stdout);
- if (subcommand && *subcommand) fprintf(pysamerr, "samtools %s: ", subcommand);
- else fprintf(pysamerr, "samtools: ");
- vfprintf(pysamerr, format, args);
- if (extra) fprintf(pysamerr, ": %s\n", extra);
- else fprintf(pysamerr, "\n");
- fflush(pysamerr);
+ fflush(pysam_stdout);
+ if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand);
+ else fprintf(pysam_stderr, "samtools: ");
+ vfprintf(pysam_stderr, format, args);
+ if (extra) fprintf(pysam_stderr, ": %s\n", extra);
+ else fprintf(pysam_stderr, "\n");
+ fflush(pysam_stderr);
}
void print_error(const char *subcommand, const char *format, ...)
int samtools_main(int argc, char *argv[])
{
#ifdef _WIN32
- setmode(fileno(stdout), O_BINARY);
+ setmode(fileno(pysam_stdout), O_BINARY);
setmode(fileno(stdin), O_BINARY);
#endif
+ if (argc < 2) { usage(pysam_stderr); return 1; }
- if (argc < 2) { usage(pysamerr); return 1; }
-
if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) {
- if (argc == 2) { usage(stdout); return 0; }
+ if (argc == 2) { usage(pysam_stdout); return 0; }
// Otherwise change "samtools help COMMAND [...]" to "samtools COMMAND";
// main_xyz() functions by convention display the subcommand's usage
argv++;
argc = 2;
}
+
int ret = 0;
if (strcmp(argv[1], "view") == 0) ret = main_samview(argc-1, argv+1);
else if (strcmp(argv[1], "import") == 0) ret = main_import(argc-1, argv+1);
else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1);
else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1);
else if (strcmp(argv[1], "pileup") == 0) {
- fprintf(pysamerr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
+ fprintf(pysam_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
return 1;
}
else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
else if (strcmp(argv[1], "--version") == 0) {
- printf(
+ fprintf(pysam_stdout,
"samtools %s\n"
"Using htslib %s\n"
-"Copyright (C) 2015 Genome Research Ltd.\n",
+"Copyright (C) 2016 Genome Research Ltd.\n",
samtools_version(), hts_version());
}
else if (strcmp(argv[1], "--version-only") == 0) {
- printf("%s+htslib-%s\n", samtools_version(), hts_version());
+ fprintf(pysam_stdout, "%s+htslib-%s\n", samtools_version(), hts_version());
}
else {
- fprintf(pysamerr, "[main] unrecognized command '%s'\n", argv[1]);
+ fprintf(pysam_stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;
}
return ret;
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <zlib.h>
#include <stdio.h>
#include <ctype.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <zlib.h>
#include <stdio.h>
#include <ctype.h>
if (usage) break;
}
if (usage || optind + 2 > argc) {
- fprintf(pysamerr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
- fprintf(pysamerr, " -Q INT Only count bases of at least INT quality [0]\n");
- sam_global_opt_help(pysamerr, "-.--.");
+ fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
+ fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n");
+ sam_global_opt_help(pysam_stderr, "-.--.");
return 1;
}
memset(&str, 0, sizeof(kstring_t));
if (aux[i]->fp)
idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
if (aux[i]->fp == 0 || idx[i] == 0) {
- fprintf(pysamerr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
+ fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
return 2;
}
// TODO bgzf_set_cache_size(aux[i]->fp, 20);
aux[i]->header = sam_hdr_read(aux[i]->fp);
if (aux[i]->header == NULL) {
- fprintf(pysamerr, "ERROR: failed to read header for '%s'\n",
+ fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n",
argv[i+optind+1]);
return 2;
}
kputc('\t', &str);
kputl(cnt[i], &str);
}
- puts(str.s);
+ fprintf(pysam_stdout, "%s\n", str.s); // puts() equivalent in one call: text first, then newline
bam_mplp_destroy(mplp);
continue;
bed_error:
- fprintf(pysamerr, "Errors in BED line '%s'\n", str.s);
+ fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s);
}
free(n_plp); free(plp);
ks_destroy(ks);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
// has called their reference "browser" or "track".
if (0 == strcmp(ref, "browser")) continue;
if (0 == strcmp(ref, "track")) continue;
- fprintf(pysamerr, "[bed_read] Parse error reading %s at line %u\n",
+ fprintf(pysam_stderr, "[bed_read] Parse error reading %s at line %u\n",
fn, line);
goto fail_no_msg;
}
bed_index(h);
return h;
fail:
- fprintf(pysamerr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno));
+ fprintf(pysam_stderr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno));
fail_no_msg:
if (ks) ks_destroy(ks);
if (fp) gzclose(fp);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) {
if (s >= 0) {
int j;
- printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s);
+ fprintf(pysam_stdout, "%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s);
for (j = s; j < i; ++j) {
int c = cns[j]>>8;
- if (c == 0) putchar('N');
- else putchar("ACGT"[c&3]);
+ if (c == 0) fputc('N', pysam_stdout);
+ else fputc("ACGT"[c&3], pysam_stdout);
}
- putchar('\t');
+ fputc('\t', pysam_stdout);
for (j = s; j < i; ++j)
- putchar(33 + (cns[j]>>8>>2));
- putchar('\n');
+ fputc(33 + (cns[j]>>8>>2), pysam_stdout);
+ fputc('\n', pysam_stdout);
}
- //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
+ //if (s >= 0) fprintf(pysam_stdout, "%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
s = -1;
} else if ((b[i]>>2&3) && s < 0) s = i;
}
}
if (ga.reference) {
g.fai = fai_load(ga.reference);
- if (g.fai == 0) fprintf(pysamerr, "[%s] fail to load the fasta index.\n", __func__);
+ if (g.fai == 0) fprintf(pysam_stderr, "[%s] fail to load the fasta index.\n", __func__);
}
if (usage || argc == optind) {
- fprintf(pysamerr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
- sam_global_opt_help(pysamerr, "-.--f");
+ fprintf(pysam_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
+ sam_global_opt_help(pysam_stderr, "-.--f");
return 1;
}
l = max_l = 0; cns = 0;
g.fp = sam_open_format(argv[optind], "r", &ga.in);
g.h = sam_hdr_read(g.fp);
if (g.h == NULL) {
- fprintf(pysamerr, "Couldn't read header for '%s'\n", argv[optind]);
+ fprintf(pysam_stderr, "Couldn't read header for '%s'\n", argv[optind]);
sam_close(g.fp);
return 1;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
+#include <unistd.h>
#include <zlib.h>
#include <getopt.h>
#include "htslib/kseq.h"
char *fname = NULL;
if ( optind>=argc )
{
- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ if ( !isatty(STDIN_FILENO) ) fname = "-"; // reading from stdin
else return dict_usage();
}
else fname = argv[optind];
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
+#include <unistd.h>
#include <zlib.h>
#include <getopt.h>
#include "htslib/kseq.h"
fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
if (fp == 0) {
- fprintf(pysamerr, "dict: %s: No such file or directory\n", fn);
+ fprintf(pysam_stderr, "dict: %s: No such file or directory\n", fn);
exit(1);
}
- FILE *out = stdout;
+ FILE *out = pysam_stdout;
if (args->output_fname) {
out = fopen(args->output_fname, "w");
if (out == NULL) {
- fprintf(pysamerr, "dict: %s: Cannot open file for writing\n", args->output_fname);
+ fprintf(pysam_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
exit(1);
}
}
static int dict_usage(void)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "About: Create a sequence dictionary file from a fasta file\n");
- fprintf(pysamerr, "Usage: samtools dict [options] <file.fa|file.fa.gz>\n\n");
- fprintf(pysamerr, "Options: -a, --assembly STR assembly\n");
- fprintf(pysamerr, " -H, --no-header do not print @HD line\n");
- fprintf(pysamerr, " -o, --output STR file to write out dict file [stdout]\n");
- fprintf(pysamerr, " -s, --species STR species\n");
- fprintf(pysamerr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "About: Create a sequence dictionary file from a fasta file\n");
+ fprintf(pysam_stderr, "Usage: samtools dict [options] <file.fa|file.fa.gz>\n\n");
+ fprintf(pysam_stderr, "Options: -a, --assembly STR assembly\n");
+ fprintf(pysam_stderr, " -H, --no-header do not print @HD line\n");
+ fprintf(pysam_stderr, " -o, --output STR file to write out dict file [pysam_stdout]\n");
+ fprintf(pysam_stderr, " -s, --species STR species\n");
+ fprintf(pysam_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
+ fprintf(pysam_stderr, "\n");
return 1;
}
char *fname = NULL;
if ( optind>=argc )
{
- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ if ( !isatty(STDIN_FILENO) ) fname = "-"; // reading from stdin
else return dict_usage();
}
else fname = argv[optind];
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include "errmod.h"
#include "htslib/ksort.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <math.h>
#include "errmod.h"
#include "htslib/ksort.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
error(NULL);
if ( argc==2 )
{
- fai_build(argv[optind]);
+ if (fai_build(argv[optind]) != 0) {
+ error("Could not build fai index %s.fai\n", argv[optind]);
+ }
return 0;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
{
va_list ap;
va_start(ap, format);
- vfprintf(pysamerr, format, ap);
+ vfprintf(pysam_stderr, format, ap);
va_end(ap);
}
else
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
+ fprintf(pysam_stderr, "\n");
}
exit(-1);
}
error(NULL);
if ( argc==2 )
{
- fai_build(argv[optind]);
+ if (fai_build(argv[optind]) != 0) {
+ error("Could not build fai index %s.fai\n", argv[optind]);
+ }
return 0;
}
while ( ++optind<argc )
{
- printf(">%s\n", argv[optind]);
+ fprintf(pysam_stdout, ">%s\n", argv[optind]);
int i, j, seq_len;
char *seq = fai_fetch(fai, argv[optind], &seq_len);
if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]);
for (i=0; i<seq_len; i+=60)
{
for (j=0; j<60 && i+j<seq_len; j++)
- putchar(seq[i+j]);
- putchar('\n');
+ fputc(seq[i+j], pysam_stdout);
+ fputc('\n', pysam_stdout);
}
free(seq);
}
SOFTWARE.
*/
+#include <config.h>
+
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
SOFTWARE.
*/
+#include <config.h>
+
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
sum += fi[u] + fi[u+1] + fi[u+2];
-// fprintf(pysamerr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
+// fprintf(pysam_stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
}
// rescale
s[i] = sum;
bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
-// fprintf(pysamerr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
+// fprintf(pysam_stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
}
// rescale
set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
if (state) state[i-1] = max_k;
if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
#ifdef _MAIN
- fprintf(pysamerr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
+ fprintf(pysam_stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
"ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
#endif
}
#ifdef _MAIN
#include <unistd.h>
-int main(int argc, char *argv[])
+int samtools_kprobaln_main(int argc, char *argv[])
{
uint8_t conv[256], *iqual, *ref, *query;
int c, l_ref, l_query, i, q = 30, b = 10, P;
}
}
if (optind + 2 > argc) {
- fprintf(pysamerr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
+ fprintf(pysam_stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
return 1;
}
memset(conv, 4, 256);
memset(iqual, q, l_query);
kpa_par_def.bw = b;
P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
- fprintf(pysamerr, "%d\n", P);
+ fprintf(pysam_stderr, "%d\n", P);
free(iqual);
return 0;
}
SOFTWARE.
*/
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
SOFTWARE.
*/
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// a fatal error
static void fatal(const char *msg)
{
- fprintf(pysamerr, "E %s\n", msg);
+ fprintf(pysam_stderr, "E %s\n", msg);
exit(1);
}
// remove pads
dst->l = j;
}
-int main(int argc, char *argv[])
+int samtools_ace2sam_main(int argc, char *argv[])
{
gzFile fp;
kstream_t *ks;
}
}
if (argc == optind) {
- fprintf(pysamerr, "\nUsage: ace2sam [-pc] <in.ace>\n\n");
- fprintf(pysamerr, "Options: -p output padded SAM\n");
- fprintf(pysamerr, " -c write the contig sequence in SAM\n\n");
- fprintf(pysamerr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
- fprintf(pysamerr, " 2. The order of reads in AF and in RD must be identical\n");
- fprintf(pysamerr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
- fprintf(pysamerr, " 4. This program writes the headerless SAM to stdout and header to pysamerr\n\n");
+ fprintf(pysam_stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n");
+ fprintf(pysam_stderr, "Options: -p output padded SAM\n");
+ fprintf(pysam_stderr, " -c write the contig sequence in SAM\n\n");
+ fprintf(pysam_stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
+ fprintf(pysam_stderr, " 2. The order of reads in AF and in RD must be identical\n");
+ fprintf(pysam_stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
+ fprintf(pysam_stderr, " 4. This program writes the headerless SAM to pysam_stdout and header to pysam_stderr\n\n");
return 1;
}
if (t[1].s[i] != '*') ++k;
}
// write out the SAM header and contig sequences
- fprintf(pysamerr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line
+ fprintf(pysam_stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line
cns = &t[is_padded?1:2];
- fprintf(pysamerr, "S >%s\n", t[0].s);
+ fprintf(pysam_stderr, "S >%s\n", t[0].s);
for (i = 0; i < cns->l; i += LINE_LEN) {
- fputs("S ", pysamerr);
+ fputs("S ", pysam_stderr);
for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
- fputc(cns->s[i + k], pysamerr);
- fputc('\n', pysamerr);
+ fputc(cns->s[i + k], pysam_stderr);
+ fputc('\n', pysam_stderr);
}
#define __padded2cigar(sp) do { \
if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*"
for (i = 0; i < t[2].l; ++i) { // read the consensus quality
int q;
- if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(pysamerr, "E truncated contig quality\n");
+ if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(pysam_stderr, "E truncated contig quality\n");
if (s.l) {
q = atoi(s.s) + 33;
if (q > 126) q = 126;
}
if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
- if (write_cns) puts(t[4].s); t[4].l = 0;
+ if (write_cns) fprintf(pysam_stdout, "%s\n", t[4].s); t[4].l = 0;
} else if (strcmp(s.s, "AF") == 0) { // padded read position
int reversed, neg, pos;
if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
if (write_cns) {
- if (t[4].l) puts(t[4].s);
+ if (t[4].l) fprintf(pysam_stdout, "%s\n", t[4].s);
t[4].l = 0;
}
ks_getuntil(ks, 0, &s, &dret); // read name
kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN
kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ
kputs("\t*", &t[4]); // QUAL
- puts(t[4].s); // print to stdout
+ fprintf(pysam_stdout, "%s\n", t[4].s); // print to pysam_stdout
++af_i;
} else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
}
/* padding.c -- depad subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2014, 2015 Genome Research Ltd.
+ Copyright (C) 2014-2016 Genome Research Ltd.
Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <htslib/faidx.h>
#include "sam_header.h"
#include "sam_opts.h"
+#include "samtools.h"
#define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
b = bam_init1();
+ if (!b) {
+ fprintf(stderr, "[depad] Couldn't allocate bam struct\n");
+ return -1;
+ }
r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
int read_ret;
while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));
next_seq:
- sam_write1(out, h, b);
+ if (sam_write1(out, h, b) < 0) {
+ print_error_errno("depad", "error writing to output");
+ return -1;
+ }
}
if (read_ret < -1) {
fprintf(stderr, "[depad] truncated file.\n");
}
// open file handlers
if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
- fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
+ print_error_errno("depad", "failed to open \"%s\" for reading", argv[optind]);
ret = 1;
goto depad_end;
}
char wmode[2];
strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b");
if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) {
- fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
+ print_error_errno("depad", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output");
ret = 1;
goto depad_end;
}
}
// Do the depad
- ret = bam_pad2unpad(in, out, h, fai);
+ if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1;
depad_end:
// close files, free and return
if (fai) fai_destroy(fai);
if (h) bam_hdr_destroy(h);
- sam_close(in);
- sam_close(out);
+ if (in) sam_close(in);
+ if (out && sam_close(out) < 0) {
+ fprintf(stderr, "[depad] error on closing output file.\n");
+ ret = 1;
+ }
free(fn_list); free(fn_out);
return ret;
}
sam_global_opt_help(stderr, "-...-");
if (is_long_help)
- fprintf(stderr, "Notes:\n\
-\n\
- 1. Requires embedded reference sequences (before the reads for that reference),\n\
- or ideally a FASTA file of the padded reference sequences (via the -T argument).\n\
-\n\
- 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\
-\n");
+ fprintf(stderr,
+"Notes:\n"
+"\n"
+"1. Requires embedded reference sequences (before the reads for that reference),\n"
+" or ideally a FASTA file of the padded reference sequences (via a -T option).\n"
+"\n"
+"2. Input padded alignment reads' CIGAR strings must not use P or I operators.\n"
+"\n");
return 1;
}
/* padding.c -- depad subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2014, 2015 Genome Research Ltd.
+ Copyright (C) 2014-2016 Genome Research Ltd.
Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <htslib/faidx.h>
#include "sam_header.h"
#include "sam_opts.h"
+#include "samtools.h"
#define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
if (0 == cigar_n_warning) {
cigar_n_warning = -1;
- fprintf(pysamerr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
+ fprintf(pysam_stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
}
} else {
- fprintf(pysamerr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
+ fprintf(pysam_stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
return -1;
}
}
fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
if (fai_ref_len != ref_len) {
- fprintf(pysamerr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
+ fprintf(pysam_stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
free(fai_ref);
return -1;
}
} else {
int i = seq_nt16_table[(int)base];
if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
- fprintf(pysamerr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
+ fprintf(pysam_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
free(fai_ref);
return -1;
}
fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
if (fai_ref_len != padded_len) {
- fprintf(pysamerr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len);
+ fprintf(pysam_stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len);
free(fai_ref);
return -1;
}
for (k = 0; k < padded_len; ++k) {
- //fprintf(pysamerr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref));
+ //fprintf(pysam_stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref));
base = fai_ref[k];
if (base == '-' || base == '*') {
gaps += 1;
} else {
int i = seq_nt16_table[(int)base];
if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
- fprintf(pysamerr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
+ fprintf(pysam_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
free(fai_ref);
return -1;
}
int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
b = bam_init1();
+ if (!b) {
+ fprintf(pysam_stderr, "[depad] Couldn't allocate bam struct\n");
+ return -1;
+ }
r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
int read_ret;
while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
uint32_t *cigar = bam_get_cigar(b);
n2 = 0;
if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) {
- // fprintf(pysamerr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b));
+ // fprintf(pysam_stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b));
r_tid = b->core.tid;
if (0!=unpad_seq(b, &r)) {
- fprintf(pysamerr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
+ fprintf(pysam_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
return -1;
};
if (h->target_len[r_tid] != r.l) {
- fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
+ fprintf(pysam_stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
return -1;
}
if (fai) {
// Check the embedded reference matches the FASTA file
if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
- fprintf(pysamerr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
+ fprintf(pysam_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
return -1;
}
assert(r.l == q.l);
for (i = 0; i < r.l; ++i) {
if (r.s[i] != q.s[i]) {
// Show gaps as ASCII 45
- fprintf(pysamerr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
+ fprintf(pysam_stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
h->target_name[b->core.tid], i+1,
r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45,
q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45);
} else if (b->core.n_cigar > 0) {
int i, k, op;
if (b->core.tid < 0) {
- fprintf(pysamerr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
+ fprintf(pysam_stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
return -1;
} else if (b->core.tid == r_tid) {
; // good case, reference available
- //fprintf(pysamerr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b));
+ //fprintf(pysam_stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b));
} else if (fai) {
if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
- fprintf(pysamerr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+ fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
return -1;
}
posmap = update_posmap(posmap, r);
r_tid = b->core.tid;
- // fprintf(pysamerr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]);
+ // fprintf(pysam_stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]);
} else {
- fprintf(pysamerr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
+ fprintf(pysam_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
return -1;
}
if (0!=unpad_seq(b, &q)) {
- fprintf(pysamerr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
+ fprintf(pysam_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
return -1;
};
if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
if (b->core.mtid < 0 || b->core.mpos < 0) {
/* Nice case, no mate to worry about*/
- // fprintf(pysamerr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b));
+ // fprintf(pysam_stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b));
/* TODO - Warning if FLAG says mate should be mapped? */
/* Clean up funny input where mate position is given but mate reference is missing: */
b->core.mtid = -1;
b->core.mpos = -1;
} else if (b->core.mtid == b->core.tid) {
/* Nice case, same reference */
- // fprintf(pysamerr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b));
+ // fprintf(pysam_stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b));
b->core.mpos = posmap[b->core.mpos];
} else {
/* Nasty case, Must load alternative posmap */
- // fprintf(pysamerr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]);
+ // fprintf(pysam_stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]);
if (!fai) {
- fprintf(pysamerr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
+ fprintf(pysam_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
return -1;
}
/* Temporarily load the other reference sequence */
if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
- fprintf(pysamerr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
+ fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
return -1;
}
posmap = update_posmap(posmap, r);
b->core.mpos = posmap[b->core.mpos];
/* Restore the reference and posmap*/
if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
- fprintf(pysamerr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+ fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
return -1;
}
posmap = update_posmap(posmap, r);
b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));
next_seq:
- sam_write1(out, h, b);
+ if (sam_write1(out, h, b) < 0) {
+ print_error_errno("depad", "error writing to output");
+ return -1;
+ }
}
if (read_ret < -1) {
- fprintf(pysamerr, "[depad] truncated file.\n");
+ fprintf(pysam_stderr, "[depad] truncated file.\n");
ret = 1;
}
free(r.s); free(q.s); free(posmap);
for (i = 0; i < old->n_targets; ++i) {
unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
if (unpadded_len < 0) {
- fprintf(pysamerr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
+ fprintf(pysam_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
} else {
header->target_len[i] = unpadded_len;
- //fprintf(pysamerr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]);
+ //fprintf(pysam_stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]);
}
}
/* Duplicating the header allocated new buffer for header string */
char *name = strstr(text, "\tSN:");
char *name_end;
if (!name) {
- fprintf(pysamerr, "Unable to find SN: header field\n");
+ fprintf(pysam_stderr, "Unable to find SN: header field\n");
return NULL;
}
name += 4;
/* Check we didn't overflow the buffer */
assert (strlen(header->text) <= strlen(old->text));
if (strlen(header->text) < header->l_text) {
- //fprintf(pysamerr, "[depad] Reallocating header buffer\n");
+ //fprintf(pysam_stderr, "[depad] Reallocating header buffer\n");
assert (newtext == header->text);
newtext = malloc(strlen(header->text) + 1);
strcpy(newtext, header->text);
header->text = newtext;
header->l_text = strlen(newtext);
}
- //fprintf(pysamerr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text);
+ //fprintf(pysam_stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text);
return header;
}
break;
case '?': is_long_help = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- fprintf(pysamerr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
+ fprintf(pysam_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
return usage(is_long_help);
}
}
}
// open file handlers
if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
- fprintf(pysamerr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
+ print_error_errno("depad", "failed to open \"%s\" for reading", argv[optind]);
ret = 1;
goto depad_end;
}
if (fn_list && hts_set_fai_filename(in, fn_list) != 0) {
- fprintf(pysamerr, "[depad] failed to load reference file \"%s\".\n", fn_list);
+ fprintf(pysam_stderr, "[depad] failed to load reference file \"%s\".\n", fn_list);
ret = 1;
goto depad_end;
}
if ((h = sam_hdr_read(in)) == 0) {
- fprintf(pysamerr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+ fprintf(pysam_stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
ret = 1;
goto depad_end;
}
if (fai) {
h_fix = fix_header(h, fai);
} else {
- fprintf(pysamerr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
+ fprintf(pysam_stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
h_fix = h;
}
char wmode[2];
strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b");
if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) {
- fprintf(pysamerr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
+ print_error_errno("depad", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output");
ret = 1;
goto depad_end;
}
hts_set_opt(out, CRAM_OPT_NO_REF, 1);
if (sam_hdr_write(out, h_fix) != 0) {
- fprintf(pysamerr, "[depad] failed to write header.\n");
+ fprintf(pysam_stderr, "[depad] failed to write header.\n");
ret = 1;
goto depad_end;
}
// Do the depad
- ret = bam_pad2unpad(in, out, h, fai);
+ if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1;
depad_end:
// close files, free and return
if (fai) fai_destroy(fai);
if (h) bam_hdr_destroy(h);
- sam_close(in);
- sam_close(out);
+ if (in) sam_close(in);
+ if (out && sam_close(out) < 0) {
+ fprintf(pysam_stderr, "[depad] error on closing output file.\n");
+ ret = 1;
+ }
free(fn_list); free(fn_out);
return ret;
}
+/* Print the "samtools depad" usage text to pysam_stderr; the extra "Notes"
+ * section is printed only when is_long_help is non-zero.  Always returns 1.
+ * The help text names the real default output ("stdout"), not the
+ * pysam_stdout variable that replaces it inside pysam. */
static int usage(int is_long_help)
{
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools depad <in.bam>\n\n");
- fprintf(pysamerr, "Options:\n");
- fprintf(pysamerr, " -s Output is SAM (default is BAM)\n");
- fprintf(pysamerr, " -S Input is SAM (default is BAM)\n");
- fprintf(pysamerr, " -u Uncompressed BAM output (can't use with -s)\n");
- fprintf(pysamerr, " -1 Fast compression BAM output (can't use with -s)\n");
- fprintf(pysamerr, " -T, --reference FILE\n");
- fprintf(pysamerr, " Padded reference sequence file [null]\n");
- fprintf(pysamerr, " -o FILE Output file name [stdout]\n");
- fprintf(pysamerr, " -? Longer help\n");
- sam_global_opt_help(pysamerr, "-...-");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Usage: samtools depad <in.bam>\n\n");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -s Output is SAM (default is BAM)\n");
+ fprintf(pysam_stderr, " -S Input is SAM (default is BAM)\n");
+ fprintf(pysam_stderr, " -u Uncompressed BAM output (can't use with -s)\n");
+ fprintf(pysam_stderr, " -1 Fast compression BAM output (can't use with -s)\n");
+ fprintf(pysam_stderr, " -T, --reference FILE\n");
+ fprintf(pysam_stderr, " Padded reference sequence file [null]\n");
+ fprintf(pysam_stderr, " -o FILE Output file name [stdout]\n");
+ fprintf(pysam_stderr, " -? Longer help\n");
+ sam_global_opt_help(pysam_stderr, "-...-");
if (is_long_help)
- fprintf(pysamerr, "Notes:\n\
-\n\
- 1. Requires embedded reference sequences (before the reads for that reference),\n\
- or ideally a FASTA file of the padded reference sequences (via the -T argument).\n\
-\n\
- 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\
-\n");
+ fprintf(pysam_stderr,
+"Notes:\n"
+"\n"
+"1. Requires embedded reference sequences (before the reads for that reference),\n"
+" or ideally a FASTA file of the padded reference sequences (via a -T option).\n"
+"\n"
+"2. Input padded alignment reads' CIGAR strings must not use P or I operators.\n"
+"\n");
return 1;
}
/* phase.c -- phase subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <zlib.h>
#include "htslib/sam.h"
+#include "htslib/kstring.h"
#include "errmod.h"
#include "sam_opts.h"
+#include "samtools.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
samFile* fp;
bam_hdr_t* fp_hdr;
char *pre;
+ char *out_name[3];
samFile* out[3];
bam_hdr_t* out_hdr[3];
// alignment queue
return ret;
}
-static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+static int dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
{
int i, is_flip, drop_ambi;
drop_ambi = g->flag & FLAG_DROP_AMBI;
if (which < 2 && is_flip) which = 1 - which; // increase the randomness
}
if (which == 3) which = (drand48() < 0.5);
- sam_write1(g->out[which], g->out_hdr[which], b);
+ if (sam_write1(g->out[which], g->out_hdr[which], b) < 0) {
+ print_error_errno("phase", "error writing to '%s'", g->out_name[which]);
+ return -1;
+ }
bam_destroy1(b);
g->b[i] = 0;
}
memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
g->n -= i;
+ return 0;
}
static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
else f->phased = 1, f->phase = f->seq[0] - 1;
}
}
- dump_aln(g, min_pos, hash);
+ if (dump_aln(g, min_pos, hash) < 0) return -1;
++g->vpos_shift;
return 1;
}
printf("//\n");
fflush(stdout);
g->vpos_shift += vpos;
- dump_aln(g, min_pos, hash);
+ if (dump_aln(g, min_pos, hash) < 0) return -1;
return vpos;
}
return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2;
}
+/*
+ * Open output file number c (g->out[c]) and write a copy of the input
+ * header to it.
+ *
+ * The file name is "<g->pre>.<middle>.<ext>", where <ext> is taken from
+ * hts_format_file_extension(fmt).  The name is kept in g->out_name[c] so
+ * later error messages can refer to it, and the duplicated header is kept
+ * in g->out_hdr[c].
+ *
+ * Returns 0 on success, or -1 after reporting the problem.  Anything
+ * already stored in g when a failure occurs is left there for the caller.
+ */
+static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat *fmt)
+{
+    kstring_t s = { 0, 0, NULL };
+
+    /* A failed ksprintf would otherwise hand a NULL name to sam_open_format. */
+    if (ksprintf(&s, "%s.%s.%s", g->pre, middle, hts_format_file_extension(fmt)) < 0) {
+        print_error_errno("phase", "Failed to build output file name for '%s.%s'", g->pre, middle);
+        free(s.s);
+        return -1;
+    }
+    g->out_name[c] = ks_release(&s);
+
+    g->out[c] = sam_open_format(g->out_name[c], "wb", fmt);
+    if (! g->out[c]) {
+        print_error_errno("phase", "Failed to open output file '%s'", g->out_name[c]);
+        return -1;
+    }
+
+    /* Check the copy before writing: sam_hdr_write must not see a NULL header. */
+    g->out_hdr[c] = bam_hdr_dup(g->fp_hdr);
+    if (! g->out_hdr[c]) {
+        print_error_errno("phase", "Failed to duplicate header for '%s'", g->out_name[c]);
+        return -1;
+    }
+    if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) {
+        print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]);
+        return -1;
+    }
+
+    return 0;
+}
+
int main_phase(int argc, char *argv[])
{
int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
{ NULL, 0, NULL, 0 }
};
+ // FIXME Leaks galore in the case of error returns
+
memset(&g, 0, sizeof(phaseg_t));
g.flag = FLAG_FIX_CHIMERA;
g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
return 1;
}
g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ if (!g.fp) {
+ print_error_errno("phase", "Couldn't open '%s'", argv[optind]);
+ return 1;
+ }
g.fp_hdr = sam_hdr_read(g.fp);
if (g.fp_hdr == NULL) {
- fprintf(stderr, "Failed to read header for '%s'\n", argv[optind]);
+ fprintf(stderr, "[%s] Failed to read header for '%s'\n",
+ __func__, argv[optind]);
return 1;
}
if (fn_list) { // read the list of sites to phase
free(fn_list);
} else g.flag &= ~FLAG_LIST_EXCL;
if (g.pre) { // open BAMs to write
- char *s = (char*)malloc(strlen(g.pre) + 20);
if (ga.out.format == unknown_format)
ga.out.format = bam; // default via "wb".
- strcpy(s, g.pre); strcat(s, ".0."); strcat(s, hts_format_file_extension(&ga.out));
- g.out[0] = sam_open_format(s, "wb", &ga.out);
- strcpy(s, g.pre); strcat(s, ".1."); strcat(s, hts_format_file_extension(&ga.out));
- g.out[1] = sam_open_format(s, "wb", &ga.out);
- strcpy(s, g.pre); strcat(s, ".chimera."); strcat(s, hts_format_file_extension(&ga.out));
- g.out[2] = sam_open_format(s, "wb", &ga.out);
- for (c = 0; c <= 2; ++c) {
- g.out_hdr[c] = bam_hdr_dup(g.fp_hdr);
- sam_hdr_write(g.out[c], g.out_hdr[c]);
- }
- free(s);
+
+ // Open each output file g.out[0..2], dupping and writing the header
+ if (start_output(&g, 0, "0", &ga.out) < 0 ||
+ start_output(&g, 1, "1", &ga.out) < 0 ||
+ start_output(&g, 2, "chimera", &ga.out) < 0) return 1;
}
iter = bam_plp_init(readaln, &g);
g.vpos_shift = 0;
if (lasttid >= 0) {
seqs = shrink_hash(seqs);
- phase(&g, g.fp_hdr->target_name[lasttid], vpos, cns, seqs);
+ if (phase(&g, g.fp_hdr->target_name[lasttid],
+ vpos, cns, seqs) < 0) {
+ return 1;
+ }
update_vpos(0x7fffffff, seqs);
}
lasttid = tid;
}
if (dophase) {
seqs = shrink_hash(seqs);
- phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs);
+ if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) {
+ return 1;
+ }
update_vpos(vpos, seqs);
cns[0] = cns[vpos];
vpos = 0;
}
++vpos;
}
- if (tid >= 0) phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs);
+ if (tid >= 0) {
+ if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) {
+ return 1;
+ }
+ }
bam_hdr_destroy(g.fp_hdr);
bam_plp_destroy(iter);
sam_close(g.fp);
errmod_destroy(em);
free(bases);
if (g.pre) {
+ int res = 0;
for (c = 0; c <= 2; ++c) {
- sam_close(g.out[c]);
+ if (sam_close(g.out[c]) < 0) {
+ fprintf(stderr, "[%s] error on closing '%s'\n",
+ __func__, g.out_name[c]);
+ res = 1;
+ }
bam_hdr_destroy(g.out_hdr[c]);
+ free(g.out_name[c]);
}
free(g.pre); free(g.b);
+ if (res) return 1;
}
sam_global_args_free(&ga);
return 0;
/* phase.c -- phase subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2013, 2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <zlib.h>
#include "htslib/sam.h"
+#include "htslib/kstring.h"
#include "errmod.h"
#include "sam_opts.h"
+#include "samtools.h"
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
samFile* fp;
bam_hdr_t* fp_hdr;
char *pre;
+ char *out_name[3];
samFile* out[3];
bam_hdr_t* out_hdr[3];
// alignment queue
return ret;
}
-static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+static int dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
{
int i, is_flip, drop_ambi;
drop_ambi = g->flag & FLAG_DROP_AMBI;
if (which < 2 && is_flip) which = 1 - which; // increase the randomness
}
if (which == 3) which = (drand48() < 0.5);
- sam_write1(g->out[which], g->out_hdr[which], b);
+ if (sam_write1(g->out[which], g->out_hdr[which], b) < 0) {
+ print_error_errno("phase", "error writing to '%s'", g->out_name[which]);
+ return -1;
+ }
bam_destroy1(b);
g->b[i] = 0;
}
memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
g->n -= i;
+ return 0;
}
static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
min_pos = i? cns[vpos]>>32 : 0x7fffffff;
if (vpos == 1) {
- printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
- printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
+ fprintf(pysam_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
+ fprintf(pysam_stdout, "M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
"ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
for (k = 0; k < kh_end(hash); ++k) {
if (kh_exist(hash, k)) {
else f->phased = 1, f->phase = f->seq[0] - 1;
}
}
- dump_aln(g, min_pos, hash);
+ if (dump_aln(g, min_pos, hash) < 0) return -1;
++g->vpos_shift;
return 1;
}
{ // phase
int **cnt;
uint64_t *mask;
- printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
+ fprintf(pysam_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
sitemask = calloc(vpos, 1);
cnt = count_all(g->k, vpos, hash);
path = dynaprog(g->k, vpos, cnt);
}
}
for (i = 0; i < n_masked; ++i)
- printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
+ fprintf(pysam_stdout, "FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
for (i = 0; i < vpos; ++i) {
uint64_t x = pcnt[i];
int8_t c[2];
c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
- printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
+ fprintf(pysam_stdout, "M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
}
free(path); free(pcnt); free(regmask); free(sitemask);
ks_introsort_rseq(n_seqs, seqs);
for (i = 0; i < n_seqs; ++i) {
frag_t *f = seqs[i];
- printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
+ fprintf(pysam_stdout, "EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
for (j = 0; j < f->vlen; ++j) {
uint32_t c = cns[f->vpos + j];
- if (f->seq[j] == 0) putchar('N');
- else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
+ if (f->seq[j] == 0) fputc('N', pysam_stdout);
+ else fputc("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)], pysam_stdout);
}
- printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
+ fprintf(pysam_stdout, "\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
}
free(seqs);
- printf("//\n");
- fflush(stdout);
+ fprintf(pysam_stdout, "//\n");
+ fflush(pysam_stdout);
g->vpos_shift += vpos;
- dump_aln(g, min_pos, hash);
+ if (dump_aln(g, min_pos, hash) < 0) return -1;
return vpos;
}
return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2;
}
+/*
+ * Open output file number c (g->out[c]) and write a copy of the input
+ * header to it.
+ *
+ * The file name is "<g->pre>.<middle>.<ext>", where <ext> is taken from
+ * hts_format_file_extension(fmt).  The name is kept in g->out_name[c] so
+ * later error messages can refer to it, and the duplicated header is kept
+ * in g->out_hdr[c].
+ *
+ * Returns 0 on success, or -1 after reporting the problem.  Anything
+ * already stored in g when a failure occurs is left there for the caller.
+ */
+static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat *fmt)
+{
+    kstring_t s = { 0, 0, NULL };
+
+    /* A failed ksprintf would otherwise hand a NULL name to sam_open_format. */
+    if (ksprintf(&s, "%s.%s.%s", g->pre, middle, hts_format_file_extension(fmt)) < 0) {
+        print_error_errno("phase", "Failed to build output file name for '%s.%s'", g->pre, middle);
+        free(s.s);
+        return -1;
+    }
+    g->out_name[c] = ks_release(&s);
+
+    g->out[c] = sam_open_format(g->out_name[c], "wb", fmt);
+    if (! g->out[c]) {
+        print_error_errno("phase", "Failed to open output file '%s'", g->out_name[c]);
+        return -1;
+    }
+
+    /* Check the copy before writing: sam_hdr_write must not see a NULL header. */
+    g->out_hdr[c] = bam_hdr_dup(g->fp_hdr);
+    if (! g->out_hdr[c]) {
+        print_error_errno("phase", "Failed to duplicate header for '%s'", g->out_name[c]);
+        return -1;
+    }
+    if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) {
+        print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]);
+        return -1;
+    }
+
+    return 0;
+}
+
int main_phase(int argc, char *argv[])
{
int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
{ NULL, 0, NULL, 0 }
};
+ // FIXME Leaks galore in the case of error returns
+
memset(&g, 0, sizeof(phaseg_t));
g.flag = FLAG_FIX_CHIMERA;
g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
if (usage) break;
}
if (usage || argc == optind) {
- fprintf(pysamerr, "\n");
- fprintf(pysamerr, "Usage: samtools phase [options] <in.bam>\n\n");
- fprintf(pysamerr, "Options: -k INT block length [%d]\n", g.k);
- fprintf(pysamerr, " -b STR prefix of BAMs to output [null]\n");
- fprintf(pysamerr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD);
- fprintf(pysamerr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ);
- fprintf(pysamerr, " -D INT max read depth [%d]\n", g.max_depth);
-// fprintf(pysamerr, " -l FILE list of sites to phase [null]\n");
- fprintf(pysamerr, " -F do not attempt to fix chimeras\n");
- fprintf(pysamerr, " -A drop reads with ambiguous phase\n");
-// fprintf(pysamerr, " -e do not discover SNPs (effective with -l)\n");
- fprintf(pysamerr, "\n");
-
- sam_global_opt_help(pysamerr, "-....");
+ fprintf(pysam_stderr, "\n");
+ fprintf(pysam_stderr, "Usage: samtools phase [options] <in.bam>\n\n");
+ fprintf(pysam_stderr, "Options: -k INT block length [%d]\n", g.k);
+ fprintf(pysam_stderr, " -b STR prefix of BAMs to output [null]\n");
+ fprintf(pysam_stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD);
+ fprintf(pysam_stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ);
+ fprintf(pysam_stderr, " -D INT max read depth [%d]\n", g.max_depth);
+// fprintf(pysam_stderr, " -l FILE list of sites to phase [null]\n");
+ fprintf(pysam_stderr, " -F do not attempt to fix chimeras\n");
+ fprintf(pysam_stderr, " -A drop reads with ambiguous phase\n");
+// fprintf(pysam_stderr, " -e do not discover SNPs (effective with -l)\n");
+ fprintf(pysam_stderr, "\n");
+
+ sam_global_opt_help(pysam_stderr, "-....");
return 1;
}
g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ if (!g.fp) {
+ print_error_errno("phase", "Couldn't open '%s'", argv[optind]);
+ return 1;
+ }
g.fp_hdr = sam_hdr_read(g.fp);
if (g.fp_hdr == NULL) {
- fprintf(pysamerr, "Failed to read header for '%s'\n", argv[optind]);
+ fprintf(pysam_stderr, "[%s] Failed to read header for '%s'\n",
+ __func__, argv[optind]);
return 1;
}
if (fn_list) { // read the list of sites to phase
free(fn_list);
} else g.flag &= ~FLAG_LIST_EXCL;
if (g.pre) { // open BAMs to write
- char *s = (char*)malloc(strlen(g.pre) + 20);
if (ga.out.format == unknown_format)
ga.out.format = bam; // default via "wb".
- strcpy(s, g.pre); strcat(s, ".0."); strcat(s, hts_format_file_extension(&ga.out));
- g.out[0] = sam_open_format(s, "wb", &ga.out);
- strcpy(s, g.pre); strcat(s, ".1."); strcat(s, hts_format_file_extension(&ga.out));
- g.out[1] = sam_open_format(s, "wb", &ga.out);
- strcpy(s, g.pre); strcat(s, ".chimera."); strcat(s, hts_format_file_extension(&ga.out));
- g.out[2] = sam_open_format(s, "wb", &ga.out);
- for (c = 0; c <= 2; ++c) {
- g.out_hdr[c] = bam_hdr_dup(g.fp_hdr);
- sam_hdr_write(g.out[c], g.out_hdr[c]);
- }
- free(s);
+
+ // Open each output file g.out[0..2], dupping and writing the header
+ if (start_output(&g, 0, "0", &ga.out) < 0 ||
+ start_output(&g, 1, "1", &ga.out) < 0 ||
+ start_output(&g, 2, "chimera", &ga.out) < 0) return 1;
}
iter = bam_plp_init(readaln, &g);
seqs = kh_init(64);
em = errmod_init(1. - 0.83);
bases = calloc(g.max_depth, 2);
- printf("CC\n");
- printf("CC\tDescriptions:\nCC\n");
- printf("CC\t CC comments\n");
- printf("CC\t PS start of a phase set\n");
- printf("CC\t FL filtered region\n");
- printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n");
- printf("CC\t EV supporting reads; SAM format\n");
- printf("CC\t // end of a phase set\nCC\n");
- printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
- printf("CC\t PS chr phaseSetStart phaseSetEnd\n");
- printf("CC\t FL chr filterStart filterEnd\n");
- printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n");
- printf("CC\nCC\n");
- fflush(stdout);
+ fprintf(pysam_stdout, "CC\n");
+ fprintf(pysam_stdout, "CC\tDescriptions:\nCC\n");
+ fprintf(pysam_stdout, "CC\t CC comments\n");
+ fprintf(pysam_stdout, "CC\t PS start of a phase set\n");
+ fprintf(pysam_stdout, "CC\t FL filtered region\n");
+ fprintf(pysam_stdout, "CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n");
+ fprintf(pysam_stdout, "CC\t EV supporting reads; SAM format\n");
+ fprintf(pysam_stdout, "CC\t // end of a phase set\nCC\n");
+ fprintf(pysam_stdout, "CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
+ fprintf(pysam_stdout, "CC\t PS chr phaseSetStart phaseSetEnd\n");
+ fprintf(pysam_stdout, "CC\t FL chr filterStart filterEnd\n");
+ fprintf(pysam_stdout, "CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n");
+ fprintf(pysam_stdout, "CC\nCC\n");
+ fflush(pysam_stdout);
while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
int i, k, c, tmp, dophase = 1, in_set = 0;
float q[16];
g.vpos_shift = 0;
if (lasttid >= 0) {
seqs = shrink_hash(seqs);
- phase(&g, g.fp_hdr->target_name[lasttid], vpos, cns, seqs);
+ if (phase(&g, g.fp_hdr->target_name[lasttid],
+ vpos, cns, seqs) < 0) {
+ return 1;
+ }
update_vpos(0x7fffffff, seqs);
}
lasttid = tid;
}
if (dophase) {
seqs = shrink_hash(seqs);
- phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs);
+ if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) {
+ return 1;
+ }
update_vpos(vpos, seqs);
cns[0] = cns[vpos];
vpos = 0;
}
++vpos;
}
- if (tid >= 0) phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs);
+ if (tid >= 0) {
+ if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) {
+ return 1;
+ }
+ }
bam_hdr_destroy(g.fp_hdr);
bam_plp_destroy(iter);
sam_close(g.fp);
errmod_destroy(em);
free(bases);
if (g.pre) {
+ int res = 0;
for (c = 0; c <= 2; ++c) {
- sam_close(g.out[c]);
+ if (sam_close(g.out[c]) < 0) {
+ fprintf(pysam_stderr, "[%s] error on closing '%s'\n",
+ __func__, g.out_name[c]);
+ res = 1;
+ }
bam_hdr_destroy(g.out_hdr[c]);
+ free(g.out_name[c]);
}
free(g.pre); free(g.b);
+ if (res) return 1;
}
sam_global_args_free(&ga);
return 0;
#ifndef PYSAM_H
#define PYSAM_H
#include "stdio.h"
-extern FILE * pysamerr;
+extern FILE * pysam_stderr;
+extern FILE * pysam_stdout;
+extern const char * pysam_stdout_fn;
#endif
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <string.h>
#include <unistd.h>
#include "htslib/faidx.h"
int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
{
if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1;
- bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
+ if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1;
return 0;
}
if (hts_fp == NULL) return NULL;
samfile_t *fp = malloc(sizeof (samfile_t));
+ if (!fp) {
+ sam_close(hts_fp);
+ return NULL;
+ }
fp->file = hts_fp;
fp->x.bam = hts_fp->fp.bgzf;
if (strchr(mode, 'r')) {
enum htsExactFormat fmt = hts_get_format(fp->file)->format;
fp->header = (bam_hdr_t *)aux; // For writing, we won't free it
fp->is_write = 1;
- if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
+ if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) {
+ if (sam_hdr_write(fp->file, fp->header) < 0) {
+ if (bam_verbose >= 1)
+ fprintf(stderr, "[samopen] Couldn't write header\n");
+ sam_close(hts_fp);
+ free(fp);
+ return NULL;
+ }
+ }
}
return fp;
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <string.h>
#include <unistd.h>
#include "htslib/faidx.h"
int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
{
if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1;
- bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
+ if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1;
return 0;
}
if (hts_fp == NULL) return NULL;
samfile_t *fp = malloc(sizeof (samfile_t));
+ if (!fp) {
+ sam_close(hts_fp);
+ return NULL;
+ }
fp->file = hts_fp;
fp->x.bam = hts_fp->fp.bgzf;
if (strchr(mode, 'r')) {
}
fp->is_write = 0;
if (fp->header->n_targets == 0 && bam_verbose >= 1)
- fprintf(pysamerr, "[samopen] no @SQ lines in the header.\n");
+ fprintf(pysam_stderr, "[samopen] no @SQ lines in the header.\n");
}
else {
enum htsExactFormat fmt = hts_get_format(fp->file)->format;
fp->header = (bam_hdr_t *)aux; // For writing, we won't free it
fp->is_write = 1;
- if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header);
+ if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) {
+ if (sam_hdr_write(fp->file, fp->header) < 0) {
+ if (bam_verbose >= 1)
+ fprintf(pysam_stderr, "[samopen] Couldn't write header\n");
+ sam_close(hts_fp);
+ free(fp);
+ return NULL;
+ }
+ }
}
return fp;
strcat(strcpy(fn_list, fn_ref), ".fai");
if (access(fn_list, R_OK) == -1) { // fn_list is unreadable
if (access(fn_ref, R_OK) == -1) {
- fprintf(pysamerr, "[samfaipath] fail to read file %s.\n", fn_ref);
+ fprintf(pysam_stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
} else {
- if (bam_verbose >= 3) fprintf(pysamerr, "[samfaipath] build FASTA index...\n");
+ if (bam_verbose >= 3) fprintf(pysam_stderr, "[samfaipath] build FASTA index...\n");
if (fai_build(fn_ref) == -1) {
- fprintf(pysamerr, "[samfaipath] fail to build FASTA index.\n");
+ fprintf(pysam_stderr, "[samfaipath] fail to build FASTA index.\n");
free(fn_list); fn_list = 0;
}
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "sam_header.h"
#include <stdio.h>
#include <string.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "sam_header.h"
#include <stdio.h>
#include <string.h>
{
va_list ap;
va_start(ap, format);
- vfprintf(pysamerr, format, ap);
+ vfprintf(pysam_stderr, format, ap);
va_end(ap);
}
if ( status==2 )
{
- print_header_line(pysamerr,tmpl_hlines->data);
- print_header_line(pysamerr,out_hlines->data);
+ print_header_line(pysam_stderr,tmpl_hlines->data);
+ print_header_line(pysam_stderr,out_hlines->data);
debug("Conflicting lines, cannot merge the headers.\n");
return 0;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
}
if (!lopt->name) {
- fprintf(pysamerr, "Unexpected global option: %s\n", lopt->name);
+ fprintf(pysam_stderr, "Unexpected global option: %s\n", lopt->name);
return -1;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
}
if (settings->library) {
const char *p = bam_get_library((bam_hdr_t*)h, b);
- if (p && strcmp(p, settings->library) != 0) return 1;
+ if (!p || strcmp(p, settings->library) != 0) return 1;
}
if (settings->remove_aux_len) {
size_t i;
}
}
if (fn_un_out) {
- if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) {
+ if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) {
print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out);
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(un_out, fn_list) != 0) {
- fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
- ret = 1;
- goto view_end;
- }
+ if (fn_list) {
+ if (hts_set_fai_filename(un_out, fn_list) != 0) {
+ fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
}
+ }
if (*out_format || is_header ||
out_un_mode[1] == 'b' || out_un_mode[1] == 'c' ||
(ga.out.format != sam && ga.out.format != unknown_format)) {
fprintf(fp,
"Notes:\n"
"\n"
-" 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n"
-" Further control over the CRAM format can be specified by using the\n"
-" --output-fmt-option, e.g. to specify the number of sequences per slice\n"
-" and to use avoid reference based compression:\n"
-" `samtools view -C --output-fmt-option seqs_per_slice=5000 \\\n"
-" --output-fmt-option no_ref -o out.cram in.bam'\n"
+"1. This command now auto-detects the input format (BAM/CRAM/SAM).\n"
+" Further control over the CRAM format can be specified by using the\n"
+" --output-fmt-option, e.g. to specify the number of sequences per slice\n"
+" and to avoid reference based compression:\n"
"\n"
-" Options can also be specified as a comma separated list within the\n"
-" --output-fmt value too. For example this is equivalent to the above\n"
-" `samtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n"
-" -o out.cram in.bam'\n"
+"\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n"
+"\t --output-fmt-option no_ref -o out.cram in.bam\n"
"\n"
-" 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n"
-" two fields of each line consisting of the reference name and the\n"
-" corresponding sequence length. The `.fai' file generated by \n"
-" `samtools faidx' is suitable for use as this file. This may be an\n"
-" empty file if reads are unaligned.\n"
+" Options can also be specified as a comma separated list within the\n"
+" --output-fmt value too. For example this is equivalent to the above\n"
"\n"
-" 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n"
+"\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n"
+"\t -o out.cram in.bam\n"
"\n"
-" 4. BAM->SAM conversion: `samtools view -h in.bam'.\n"
+"2. The file supplied with `-t' is SPACE/TAB delimited with the first\n"
+" two fields of each line consisting of the reference name and the\n"
+" corresponding sequence length. The `.fai' file generated by \n"
+" `samtools faidx' is suitable for use as this file. This may be an\n"
+" empty file if reads are unaligned.\n"
"\n"
-" 5. A region should be presented in one of the following formats:\n"
-" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n"
-" specified, the input alignment file must be a sorted and indexed\n"
-" alignment (BAM/CRAM) file.\n"
+"3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n"
"\n"
-" 6. Option `-u' is preferred over `-b' when the output is piped to\n"
-" another samtools command.\n"
+"4. BAM->SAM conversion: samtools view -h in.bam\n"
+"\n"
+"5. A region should be presented in one of the following formats:\n"
+" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n"
+" specified, the input alignment file must be a sorted and indexed\n"
+" alignment (BAM/CRAM) file.\n"
+"\n"
+"6. Option `-u' is preferred over `-b' when the output is piped to\n"
+" another samtools command.\n"
"\n");
return exit_status;
static void bam2fq_usage(FILE *to, const char *command)
{
+ int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
fprintf(to,
"Usage: samtools %s [options...] <in.bam>\n", command);
fprintf(to,
" -2 FILE write paired reads flagged READ2 to FILE\n"
" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
-" -n don't append /1 and /2 to the read name\n"
-" -O output quality in the OQ tag if present\n"
+" -n don't append /1 and /2 to the read name\n");
+ if (fq) fprintf(to,
+" -O output quality in the OQ tag if present\n");
+ fprintf(to,
" -s FILE write singleton reads to FILE [assume single-end]\n"
-" -t copy RG, BC and QT tags to the FASTQ header line\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
+ fq ? "FASTQ" : "FASTA");
+ if (fq) fprintf(to,
" -v INT default quality score if not given in file [1]\n");
sam_global_opt_help(to, "-.--.");
}
uint8_t *seq;
uint8_t *qual = bam_get_qual(b);
const uint8_t *oq = NULL;
- if (state->use_oq) oq = bam_aux_get(b, "OQ") + 1;
+ if (state->use_oq) {
+ oq = bam_aux_get(b, "OQ");
+ if (oq) oq++; // skip tag type
+ }
bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
linebuf->l = 0;
bool valid = true;
while (true) {
- at_eof = sam_read1(state->fp, state->h, b);
+ at_eof = sam_read1(state->fp, state->h, b) < 0;
if (!at_eof && filter_it_out(b, state)) continue;
if (!at_eof) ++n_reads;
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
}
if (settings->library) {
const char *p = bam_get_library((bam_hdr_t*)h, b);
- if (p && strcmp(p, settings->library) != 0) return 1;
+ if (!p || strcmp(p, settings->library) != 0) return 1;
}
if (settings->remove_aux_len) {
size_t i;
case 'x':
{
if (strlen(optarg) != 2) {
- fprintf(pysamerr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
- return usage(pysamerr, EXIT_FAILURE, is_long_help);
+ fprintf(pysam_stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
+ return usage(pysam_stderr, EXIT_FAILURE, is_long_help);
}
settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
settings.remove_aux[settings.remove_aux_len-1] = optarg;
default:
if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
- return usage(pysamerr, EXIT_FAILURE, is_long_help);
+ return usage(pysam_stderr, EXIT_FAILURE, is_long_help);
break;
}
}
strcat(out_mode, tmp);
strcat(out_un_mode, tmp);
}
- if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak...
+ if (argc == optind && isatty(STDIN_FILENO)) return usage(pysam_stdout, EXIT_SUCCESS, is_long_help); // potential memory leak...
fn_in = (optind < argc)? argv[optind] : "-";
// generate the fn_list if necessary
if (fn_list) {
if (hts_set_fai_filename(in, fn_list) != 0) {
- fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
ret = 1;
goto view_end;
}
}
if ((header = sam_hdr_read(in)) == 0) {
- fprintf(pysamerr, "[main_samview] fail to read the header from \"%s\".\n", fn_in);
+ fprintf(pysam_stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in);
ret = 1;
goto view_end;
}
}
if (fn_list) {
if (hts_set_fai_filename(out, fn_list) != 0) {
- fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
ret = 1;
goto view_end;
}
out_mode[1] == 'b' || out_mode[1] == 'c' ||
(ga.out.format != sam && ga.out.format != unknown_format)) {
if (sam_hdr_write(out, header) != 0) {
- fprintf(pysamerr, "[main_samview] failed to write the SAM header\n");
+ fprintf(pysam_stderr, "[main_samview] failed to write the SAM header\n");
ret = 1;
goto view_end;
}
}
if (fn_un_out) {
- if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) {
+ if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) {
print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out);
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(un_out, fn_list) != 0) {
- fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
- ret = 1;
- goto view_end;
- }
+ if (fn_list) {
+ if (hts_set_fai_filename(un_out, fn_list) != 0) {
+ fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ ret = 1;
+ goto view_end;
}
+ }
if (*out_format || is_header ||
out_un_mode[1] == 'b' || out_un_mode[1] == 'c' ||
(ga.out.format != sam && ga.out.format != unknown_format)) {
if (sam_hdr_write(un_out, header) != 0) {
- fprintf(pysamerr, "[main_samview] failed to write the SAM header\n");
+ fprintf(pysam_stderr, "[main_samview] failed to write the SAM header\n");
ret = 1;
goto view_end;
}
}
}
if (r < -1) {
- fprintf(pysamerr, "[main_samview] truncated file.\n");
+ fprintf(pysam_stderr, "[main_samview] truncated file.\n");
ret = 1;
}
bam_destroy1(b);
bam1_t *b;
hts_idx_t *idx = sam_index_load(in, fn_in); // load index
if (idx == 0) { // index is unavailable
- fprintf(pysamerr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n");
+ fprintf(pysam_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n");
ret = 1;
goto view_end;
}
if (iter == NULL) { // region invalid or reference name not found
int beg, end;
if (hts_parse_reg(argv[i], &beg, &end))
- fprintf(pysamerr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
+ fprintf(pysam_stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
else
- fprintf(pysamerr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]);
+ fprintf(pysam_stderr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]);
continue;
}
// fetch alignments
}
hts_itr_destroy(iter);
if (result < -1) {
- fprintf(pysamerr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]);
+ fprintf(pysam_stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]);
ret = 1;
break;
}
view_end:
if (is_count && ret == 0)
- printf("%" PRId64 "\n", count);
+ fprintf(pysam_stdout, "%" PRId64 "\n", count);
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
" -h include header in SAM output\n"
" -H print SAM header only (no alignments)\n"
" -c print only the count of matching records\n"
-" -o FILE output file name [stdout]\n"
+" -o FILE output file name [stdout]\n"
" -U FILE output reads not selected by filters to FILE [null]\n"
// extra input
" -t FILE FILE listing reference names and lengths (see long help) [null]\n"
fprintf(fp,
"Notes:\n"
"\n"
-" 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n"
-" Further control over the CRAM format can be specified by using the\n"
-" --output-fmt-option, e.g. to specify the number of sequences per slice\n"
-" and to use avoid reference based compression:\n"
-" `samtools view -C --output-fmt-option seqs_per_slice=5000 \\\n"
-" --output-fmt-option no_ref -o out.cram in.bam'\n"
+"1. This command now auto-detects the input format (BAM/CRAM/SAM).\n"
+" Further control over the CRAM format can be specified by using the\n"
+" --output-fmt-option, e.g. to specify the number of sequences per slice\n"
+" and to avoid reference based compression:\n"
"\n"
-" Options can also be specified as a comma separated list within the\n"
-" --output-fmt value too. For example this is equivalent to the above\n"
-" `samtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n"
-" -o out.cram in.bam'\n"
+"\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n"
+"\t --output-fmt-option no_ref -o out.cram in.bam\n"
"\n"
-" 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n"
-" two fields of each line consisting of the reference name and the\n"
-" corresponding sequence length. The `.fai' file generated by \n"
-" `samtools faidx' is suitable for use as this file. This may be an\n"
-" empty file if reads are unaligned.\n"
+" Options can also be specified as a comma separated list within the\n"
+" --output-fmt value too. For example this is equivalent to the above\n"
"\n"
-" 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n"
+"\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n"
+"\t -o out.cram in.bam\n"
"\n"
-" 4. BAM->SAM conversion: `samtools view -h in.bam'.\n"
+"2. The file supplied with `-t' is SPACE/TAB delimited with the first\n"
+" two fields of each line consisting of the reference name and the\n"
+" corresponding sequence length. The `.fai' file generated by \n"
+" `samtools faidx' is suitable for use as this file. This may be an\n"
+" empty file if reads are unaligned.\n"
"\n"
-" 5. A region should be presented in one of the following formats:\n"
-" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n"
-" specified, the input alignment file must be a sorted and indexed\n"
-" alignment (BAM/CRAM) file.\n"
+"3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n"
"\n"
-" 6. Option `-u' is preferred over `-b' when the output is piped to\n"
-" another samtools command.\n"
+"4. BAM->SAM conversion: samtools view -h in.bam\n"
+"\n"
+"5. A region should be presented in one of the following formats:\n"
+" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n"
+" specified, the input alignment file must be a sorted and indexed\n"
+" alignment (BAM/CRAM) file.\n"
+"\n"
+"6. Option `-u' is preferred over `-b' when the output is piped to\n"
+" another samtools command.\n"
"\n");
return exit_status;
int argc2, ret;
char **argv2;
if (argc != 4) {
- fprintf(pysamerr, "Usage: samtools import <in.ref_list> <in.sam> <out.bam>\n");
+ fprintf(pysam_stderr, "Usage: samtools import <in.ref_list> <in.sam> <out.bam>\n");
return 1;
}
argc2 = 6;
static void bam2fq_usage(FILE *to, const char *command)
{
+ int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
fprintf(to,
"Usage: samtools %s [options...] <in.bam>\n", command);
fprintf(to,
" -2 FILE write paired reads flagged READ2 to FILE\n"
" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
-" -n don't append /1 and /2 to the read name\n"
-" -O output quality in the OQ tag if present\n"
+" -n don't append /1 and /2 to the read name\n");
+ if (fq) fprintf(to,
+" -O output quality in the OQ tag if present\n");
+ fprintf(to,
" -s FILE write singleton reads to FILE [assume single-end]\n"
-" -t copy RG, BC and QT tags to the FASTQ header line\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
+ fq ? "FASTQ" : "FASTA");
+ if (fq) fprintf(to,
" -v INT default quality score if not given in file [1]\n");
sam_global_opt_help(to, "-.--.");
}
uint8_t *seq;
uint8_t *qual = bam_get_qual(b);
const uint8_t *oq = NULL;
- if (state->use_oq) oq = bam_aux_get(b, "OQ") + 1;
+ if (state->use_oq) {
+ oq = bam_aux_get(b, "OQ");
+ if (oq) oq++; // skip tag type
+ }
bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
linebuf->l = 0;
case 's': opts->fnse = optarg; break;
case 't': opts->copy_tags = true; break;
case 'v': opts->def_qual = atoi(optarg); break;
- case '?': bam2fq_usage(pysamerr, argv[0]); free(opts); return false;
+ case '?': bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false;
default:
if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
- bam2fq_usage(pysamerr, argv[0]); free(opts); return false;
+ bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false;
}
break;
}
if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
if (opts->def_qual < 0 || 93 < opts->def_qual) {
- fprintf(pysamerr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
- bam2fq_usage(pysamerr, argv[0]);
+ fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
+ bam2fq_usage(pysam_stderr, argv[0]);
free(opts);
return true;
}
opts->filetype = FASTA;
} else {
print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
- bam2fq_usage(pysamerr, argv[0]);
+ bam2fq_usage(pysam_stderr, argv[0]);
free(opts);
return false;
}
if ((argc - (optind)) == 0) {
- bam2fq_usage(stdout, argv[0]);
+ bam2fq_usage(pysam_stdout, argv[0]);
free(opts);
return false;
}
if ((argc - (optind)) != 1) {
- fprintf(pysamerr, "Too many arguments.\n");
- bam2fq_usage(pysamerr, argv[0]);
+ fprintf(pysam_stderr, "Too many arguments.\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
free(opts);
return false;
}
uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
if (opts->use_oq) rf |= SAM_AUX;
if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
- fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
free(state);
return false;
}
if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) {
- fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
free(state);
return false;
}
return false;
}
} else {
- state->fpr[i] = stdout;
+ state->fpr[i] = pysam_stdout;
}
}
state->h = sam_hdr_read(state->fp);
if (state->h == NULL) {
- fprintf(pysamerr, "Failed to read header for \"%s\"\n", opts->fn_input);
+ fprintf(pysam_stderr, "Failed to read header for \"%s\"\n", opts->fn_input);
free(state);
return false;
}
if (state->fpse && fclose(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
int i;
for (i = 0; i < 3; ++i) {
- if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
+ if (state->fpr[i] != pysam_stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
}
free(state);
return valid;
bool valid = true;
while (true) {
- at_eof = sam_read1(state->fp, state->h, b);
+ at_eof = sam_read1(state->fp, state->h, b) < 0;
if (!at_eof && filter_it_out(b, state)) continue;
if (!at_eof) ++n_reads;
int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
if (b_score > score[which_readpart(b)]) {
if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) {
- fprintf(pysamerr, "[%s] Error converting read to FASTA/Q\n", __func__);
+ fprintf(pysam_stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
return false;
}
score[which_readpart(b)] = b_score;
free(linebuf[0].s);
free(linebuf[1].s);
free(linebuf[2].s);
- fprintf(pysamerr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
- fprintf(pysamerr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+ fprintf(pysam_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
+ fprintf(pysam_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
return valid;
}
free(linebuf.s);
bam_destroy1(b);
- fprintf(pysamerr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+ fprintf(pysam_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
return true;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include "sample.h"
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdlib.h>
#include <string.h>
#include "sample.h"
*/
+#include <config.h>
+
#include <unistd.h> // for isatty()
#include <stdio.h>
#include <stdlib.h>
*/
+#include <config.h>
+
#include <unistd.h> // for isatty()
#include <stdio.h>
#include <stdlib.h>
if ( tid < 0 )
{
if ( !warned )
- fprintf(pysamerr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line.s);
+ fprintf(pysam_stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line.s);
warned = 1;
continue;
}
{
khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
if ( k != kh_end(stats->rg_hash) )
- fprintf(pysamerr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
+ fprintf(pysam_stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
int ret;
k = kh_put(kh_rg, stats->rg_hash, key, &ret);
kh_value(stats->rg_hash, k) = val;
if ( !n )
error("The sample or read group \"%s\" not present.\n", id);
#else
- fprintf(pysamerr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n");
+ fprintf(pysam_stderr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n");
abort();
#endif
}
{
if ( !format )
{
- printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamstats.\n");
- printf("Usage: samtools stats [OPTIONS] file.bam\n");
- printf(" samtools stats [OPTIONS] file.bam chr:from-to\n");
- printf("Options:\n");
- printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
- printf(" -d, --remove-dups Exclude from statistics reads marked as duplicates\n");
- printf(" -f, --required-flag <str|int> Required flag, 0 for unset. See also `samtools flags` [0]\n");
- printf(" -F, --filtering-flag <str|int> Filtering flag, 0 for unset. See also `samtools flags` [0]\n");
- printf(" --GC-depth <float> the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n");
- printf(" -h, --help This help message\n");
- printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
- printf(" -I, --id <string> Include only listed read group or sample name\n");
- printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
- printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
- printf(" -P, --split-prefix <str> Path or string prefix for filepaths output by -S (default is input filename)\n");
- printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
- printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n");
- printf(" -s, --sam Ignored (input format is auto-detected).\n");
- printf(" -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
- printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
- printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
- sam_global_opt_help(stdout, "-.--.");
- printf("\n");
+ fprintf(pysam_stdout, "About: The program collects statistics from BAM files. The output can be visualized using plot-bamstats.\n");
+ fprintf(pysam_stdout, "Usage: samtools stats [OPTIONS] file.bam\n");
+ fprintf(pysam_stdout, " samtools stats [OPTIONS] file.bam chr:from-to\n");
+ fprintf(pysam_stdout, "Options:\n");
+ fprintf(pysam_stdout, " -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
+ fprintf(pysam_stdout, " -d, --remove-dups Exclude from statistics reads marked as duplicates\n");
+ fprintf(pysam_stdout, " -f, --required-flag <str|int> Required flag, 0 for unset. See also `samtools flags` [0]\n");
+ fprintf(pysam_stdout, " -F, --filtering-flag <str|int> Filtering flag, 0 for unset. See also `samtools flags` [0]\n");
+ fprintf(pysam_stdout, " --GC-depth <float> the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n");
+ fprintf(pysam_stdout, " -h, --help This help message\n");
+ fprintf(pysam_stdout, " -i, --insert-size <int> Maximum insert size [8000]\n");
+ fprintf(pysam_stdout, " -I, --id <string> Include only listed read group or sample name\n");
+ fprintf(pysam_stdout, " -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
+ fprintf(pysam_stdout, " -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
+ fprintf(pysam_stdout, " -P, --split-prefix <str> Path or string prefix for filepaths output by -S (default is input filename)\n");
+ fprintf(pysam_stdout, " -q, --trim-quality <int> The BWA trimming parameter [0]\n");
+ fprintf(pysam_stdout, " -r, --ref-seq <file> Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n");
+ fprintf(pysam_stdout, " -s, --sam Ignored (input format is auto-detected).\n");
+ fprintf(pysam_stdout, " -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
+ fprintf(pysam_stdout, " -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
+ fprintf(pysam_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
+ sam_global_opt_help(pysam_stdout, "-.--.");
+ fprintf(pysam_stdout, "\n");
}
else
{
va_list ap;
va_start(ap, format);
- vfprintf(pysamerr, format, ap);
+ vfprintf(pysam_stderr, format, ap);
va_end(ap);
}
exit(1);
}
if (ret < -1) {
- fprintf(pysamerr, "Failure while decoding file\n");
+ fprintf(pysam_stderr, "Failure while decoding file\n");
return 1;
}
}
round_buffer_flush(all_stats, -1);
- output_stats(stdout, all_stats, sparse);
+ output_stats(pysam_stdout, all_stats, sparse);
if (info->split_tag)
output_split_stats(split_hash, bam_fname, sparse);
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include "stats_isize.h"
#include <htslib/khash.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <stdio.h>
#include "stats_isize.h"
#include <htslib/khash.h>
kh_value(h, it) = rec;
a->max = max(at, a->max);
} else {
- fprintf(pysamerr, "%s\n", "Failed to allocate memory for isize_sparse_record_t");
+ fprintf(pysam_stderr, "%s\n", "Failed to allocate memory for isize_sparse_record_t");
exit(11);
}
} else {
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_sort.c"
#include "../test.h"
#include <stdio.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_sort.c"
#include "../test.h"
#include <stdio.h>
#include <unistd.h>
void dump_read(bam1_t* b) {
- printf("->core.tid:(%d)\n", b->core.tid);
- printf("->core.pos:(%d)\n", b->core.pos);
- printf("->core.bin:(%d)\n", b->core.bin);
- printf("->core.qual:(%d)\n", b->core.qual);
- printf("->core.l_qname:(%d)\n", b->core.l_qname);
- printf("->core.flag:(%d)\n", b->core.flag);
- printf("->core.n_cigar:(%d)\n", b->core.n_cigar);
- printf("->core.l_qseq:(%d)\n", b->core.l_qseq);
- printf("->core.mtid:(%d)\n", b->core.mtid);
- printf("->core.mpos:(%d)\n", b->core.mpos);
- printf("->core.isize:(%d)\n", b->core.isize);
+ fprintf(pysam_stdout, "->core.tid:(%d)\n", b->core.tid);
+ fprintf(pysam_stdout, "->core.pos:(%d)\n", b->core.pos);
+ fprintf(pysam_stdout, "->core.bin:(%d)\n", b->core.bin);
+ fprintf(pysam_stdout, "->core.qual:(%d)\n", b->core.qual);
+ fprintf(pysam_stdout, "->core.l_qname:(%d)\n", b->core.l_qname);
+ fprintf(pysam_stdout, "->core.flag:(%d)\n", b->core.flag);
+ fprintf(pysam_stdout, "->core.n_cigar:(%d)\n", b->core.n_cigar);
+ fprintf(pysam_stdout, "->core.l_qseq:(%d)\n", b->core.l_qseq);
+ fprintf(pysam_stdout, "->core.mtid:(%d)\n", b->core.mtid);
+ fprintf(pysam_stdout, "->core.mpos:(%d)\n", b->core.mpos);
+ fprintf(pysam_stdout, "->core.isize:(%d)\n", b->core.isize);
if (b->data) {
- printf("->data:");
+ fprintf(pysam_stdout, "->data:");
int i;
for (i = 0; i < b->l_data; ++i) {
- printf("%x ", b->data[i]);
+ fprintf(pysam_stdout, "%x ", b->data[i]);
}
- printf("\n");
+ fprintf(pysam_stdout, "\n");
}
if (b->core.l_qname) {
- printf("qname: %s\n",bam_get_qname(b));
+ fprintf(pysam_stdout, "qname: %s\n",bam_get_qname(b));
}
if (b->core.l_qseq) {
- printf("qseq:");
+ fprintf(pysam_stdout, "qseq:");
int i;
for (i = 0; i < b->core.l_qseq; ++i) {
- printf("%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]);
+ fprintf(pysam_stdout, "%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]);
}
- printf("\n");
- printf("qual:");
+ fprintf(pysam_stdout, "\n");
+ fprintf(pysam_stdout, "qual:");
for (i = 0; i < b->core.l_qseq; ++i) {
- printf("%c",bam_get_qual(b)[i]);
+ fprintf(pysam_stdout, "%c",bam_get_qual(b)[i]);
}
- printf("\n");
+ fprintf(pysam_stdout, "\n");
}
uint8_t* aux = bam_get_aux(b);
while (i < bam_get_l_aux(b)) {
- printf("%.2s:%c:",aux+i,*(aux+i+2));
+ fprintf(pysam_stdout, "%.2s:%c:",aux+i,*(aux+i+2));
i += 2;
switch (*(aux+i)) {
case 'Z':
- while (*(aux+1+i) != '\0') { putc(*(aux+1+i), stdout); ++i; }
+ while (*(aux+1+i) != '\0') { putc(*(aux+1+i), pysam_stdout); ++i; }
break;
}
- putc('\n',stdout);
+ putc('\n',pysam_stdout);
++i;++i;
}
}
- printf("\n");
+ fprintf(pysam_stdout, "\n");
}
void trans_tbl_test_init(trans_tbl_t* tbl, int32_t n_targets)
}
-int main(int argc, char**argv)
+int samtools_test_bam_translate_main(int argc, char**argv)
{
// test state
const int NUM_TESTS = 6;
bam1_t* b;
- // Setup pysamerr redirect
+ // Setup pysam_stderr redirect
kstring_t res = { 0, 0, NULL };
- FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
+ FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr
char* tempfname = (optind < argc)? argv[optind] : "test_bam_translate.tmp";
FILE* check = NULL;
// setup
- if (verbose) printf("BEGIN test 1\n"); // TID test
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // TID test
trans_tbl_t tbl1;
setup_test_1(&b,&tbl1);
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
- if (verbose) printf("RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 1\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bam_translate(b, &tbl1);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 1\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 1\n");
}
fclose(check);
// teardown
bam_destroy1(b);
trans_tbl_destroy(&tbl1);
- if (verbose) printf("END test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END test 1\n");
// setup
- if (verbose) printf("BEGIN test 2\n"); // RG exists and translate test
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); // RG exists and translate test
trans_tbl_t tbl2;
setup_test_2(&b,&tbl2);
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
- if (verbose) printf("RUN test 2\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 2\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bam_translate(b, &tbl2);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 2\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 2\n");
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 2\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 2\n");
}
fclose(check);
// teardown
bam_destroy1(b);
trans_tbl_destroy(&tbl2);
- if (verbose) printf("END test 2\n");
+ if (verbose) fprintf(pysam_stdout, "END test 2\n");
- if (verbose) printf("BEGIN test 3\n"); // PG exists and translate test
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 3\n"); // PG exists and translate test
// setup
trans_tbl_t tbl3;
setup_test_3(&b,&tbl3);
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
- if (verbose) printf("RUN test 3\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 3\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bam_translate(b, &tbl3);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 3\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 3\n");
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 3\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 3\n");
}
fclose(check);
// teardown
bam_destroy1(b);
trans_tbl_destroy(&tbl3);
- if (verbose) printf("END test 3\n");
+ if (verbose) fprintf(pysam_stdout, "END test 3\n");
- if (verbose) printf("BEGIN test 4\n"); // RG test non-existent
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 4\n"); // RG test non-existent
// setup
trans_tbl_t tbl4;
setup_test_4(&b,&tbl4);
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
- if (verbose) printf("RUN test 4\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 4\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bam_translate(b, &tbl4);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 4\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 4\n");
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
// check result
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 4\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 4\n");
}
fclose(check);
// teardown
bam_destroy1(b);
trans_tbl_destroy(&tbl4);
- if (verbose) printf("END test 4\n");
+ if (verbose) fprintf(pysam_stdout, "END test 4\n");
- if (verbose) printf("BEGIN test 5\n"); // PG test non-existent
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 5\n"); // PG test non-existent
// setup
trans_tbl_t tbl5;
setup_test_5(&b,&tbl5);
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
- printf("RUN test 5\n");
+ fprintf(pysam_stdout, "RUN test 5\n");
}
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bam_translate(b, &tbl5);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 5\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 5\n");
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 5\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 5\n");
}
fclose(check);
// teardown
bam_destroy1(b);
trans_tbl_destroy(&tbl5);
- if (verbose) printf("END test 5\n");
+ if (verbose) fprintf(pysam_stdout, "END test 5\n");
- if (verbose) printf("BEGIN test 6\n"); // RG and PG exists and translate test
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 6\n"); // RG and PG exists and translate test
// setup
trans_tbl_t tbl6;
setup_test_6(&b,&tbl6);
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
- if (verbose) printf("RUN test 6\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 6\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bam_translate(b, &tbl6);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 6\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 6\n");
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_read(b);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 6\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 6\n");
}
fclose(check);
// teardown
bam_destroy1(b);
trans_tbl_destroy(&tbl6);
- if (verbose) printf("END test 6\n");
+ if (verbose) fprintf(pysam_stdout, "END test 6\n");
// Cleanup
free(res.s);
remove(tempfname);
if (failure > 0)
- fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
- fclose(orig_pysamerr);
+ fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success);
+ fclose(orig_pysam_stderr);
return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_sort.c"
void dump_rtrans(int* rtrans, int n, int n_targets) {
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_sort.c"
void dump_rtrans(int* rtrans, int n, int n_targets) {
- printf("->n_targets:(%d)\n", n_targets);
+ fprintf(pysam_stdout, "->n_targets:(%d)\n", n_targets);
int i, j;
for (i = 0; i < n; ++i) {
- fprintf(pysamerr, "%d",rtrans[i*n_targets+0]);
+ fprintf(pysam_stderr, "%d",rtrans[i*n_targets+0]);
for (j = 1; j < n_targets; ++j)
- fprintf(pysamerr, "\t%d",rtrans[i*n_targets+j]);
- fprintf(pysamerr, "\n");
+ fprintf(pysam_stderr, "\t%d",rtrans[i*n_targets+j]);
+ fprintf(pysam_stderr, "\n");
}
}
}
-int main(int argc, char**argv)
+int samtools_test_rtrans_build_main(int argc, char**argv)
{
const int NUM_TESTS = 1;
int verbose = 0;
const long GIMMICK_SEED = 0x1234330e;
srand48(GIMMICK_SEED);
- if (verbose) printf("BEGIN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n");
// setup
trans_tbl_t tbl_1[2];
int n_targets_1 = 3;
if (verbose > 1) {
// dump_trans_tid
}
- if (verbose) printf("RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 1\n");
rtrans_1 = rtrans_build(n_1, n_targets_1, &tbl_1[0]);
- if (verbose) printf("END RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
if (verbose > 1) {
- printf("rtrans\n");
+ fprintf(pysam_stdout, "rtrans\n");
dump_rtrans(rtrans_1, n_1, n_targets_1);
}
if (check_test_1(&tbl_1[0], rtrans_1)) {
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 1\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 1\n");
}
// teardown
trans_tbl_destroy(&tbl_1[0]);
trans_tbl_destroy(&tbl_1[1]);
free(rtrans_1);
- if (verbose) printf("END test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END test 1\n");
if (success == NUM_TESTS) {
return 0;
} else {
- fprintf(pysamerr, "%d failures %d successes\n", failure, success);
+ fprintf(pysam_stderr, "%d failures %d successes\n", failure, success);
return 1;
}
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_sort.c"
#include <assert.h>
#include <regex.h>
static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) {
trans_tbl_t dummy;
int res;
- res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, NULL);
+ res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL);
trans_tbl_destroy(&dummy);
return res;
}
dump_header(translate);
}
if (verbose) printf("RUN test 1\n");
- trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, NULL);
+ trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 1\n");
dump_header(translate);
}
if (verbose) printf("RUN test 2\n");
- trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, NULL);
+ trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 2\n");
dump_header(translate);
}
if (verbose) printf("RUN test 3\n");
- trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, NULL);
+ trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 3\n");
dump_header(translate);
}
if (verbose) printf("RUN test 4\n");
- trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, NULL);
+ trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 4\n");
dump_header(translate);
}
if (verbose) printf("RUN test 5\n");
- trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, NULL);
+ trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 5\n");
dump_header(translate);
}
if (verbose) printf("RUN test 6\n");
- trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, "filename");
+ trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename");
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
if (verbose) printf("END RUN test 6\n");
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_sort.c"
#include <assert.h>
#include <regex.h>
} refseq_info_t;
void dump_header(bam_hdr_t* hdr) {
- printf("->n_targets:(%d)\n", hdr->n_targets);
+ fprintf(pysam_stdout, "->n_targets:(%d)\n", hdr->n_targets);
int i;
for (i = 0; i < hdr->n_targets; ++i) {
- printf("->target_name[%d]:(%s)\n",i,hdr->target_name[i]);
- printf("->target_len[%d]:(%d)\n",i,hdr->target_len[i]);
+ fprintf(pysam_stdout, "->target_name[%d]:(%s)\n",i,hdr->target_name[i]);
+ fprintf(pysam_stdout, "->target_len[%d]:(%d)\n",i,hdr->target_len[i]);
}
- printf("->text:(");
- fwrite((void*)hdr->text, (size_t) hdr->l_text, 1, stdout);
- printf(")\n");
+ fprintf(pysam_stdout, "->text:(");
+ fwrite((void*)hdr->text, (size_t) hdr->l_text, 1, pysam_stdout);
+ fprintf(pysam_stdout, ")\n");
}
static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) {
trans_tbl_t dummy;
int res;
- res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, NULL);
+ res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL);
trans_tbl_destroy(&dummy);
return res;
}
return true;
}
-int main(int argc, char**argv)
+int samtools_test_trans_tbl_init_main(int argc, char**argv)
{
const int NUM_TESTS = 6;
int verbose = 0;
bam_hdr_t* out;
bam_hdr_t* translate;
- if (verbose) printf("BEGIN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n");
// setup
trans_tbl_t tbl_1;
merged_header_t *merged_hdr = init_merged_header();
assert(translate);
// test
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
}
- if (verbose) printf("RUN test 1\n");
- trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, NULL);
+ if (verbose) fprintf(pysam_stdout, "RUN test 1\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
- if (verbose) printf("END RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
- printf("out\n");
+ fprintf(pysam_stdout, "out\n");
dump_header(out);
}
if (check_test_1(translate, out, &tbl_1)) {
- if (verbose) printf("Test 1 : PASS\n");
+ if (verbose) fprintf(pysam_stdout, "Test 1 : PASS\n");
++success;
} else {
- if (verbose) printf("Test 1 : FAIL\n");
- fprintf(pysamerr, "Test 1 : FAIL\n");
+ if (verbose) fprintf(pysam_stdout, "Test 1 : FAIL\n");
+ fprintf(pysam_stderr, "Test 1 : FAIL\n");
++failure;
}
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_1);
- if (verbose) printf("END test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END test 1\n");
// test
- if (verbose) printf("BEGIN test 2\n");
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n");
// reinit
trans_tbl_t tbl_2;
translate = setup_test_2(merged_hdr);
assert(translate);
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
}
- if (verbose) printf("RUN test 2\n");
- trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, NULL);
+ if (verbose) fprintf(pysam_stdout, "RUN test 2\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
- if (verbose) printf("END RUN test 2\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 2\n");
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
- printf("out\n");
+ fprintf(pysam_stdout, "out\n");
dump_header(out);
}
if (check_test_2(translate, out, &tbl_2)) {
- if (verbose) printf("Test 2 : PASS\n");
+ if (verbose) fprintf(pysam_stdout, "Test 2 : PASS\n");
++success;
} else {
- if (verbose) printf("Test 2 : FAIL\n");
- fprintf(pysamerr, "Test 2 : FAIL\n");
+ if (verbose) fprintf(pysam_stdout, "Test 2 : FAIL\n");
+ fprintf(pysam_stderr, "Test 2 : FAIL\n");
++failure;
}
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_2);
- if (verbose) printf("END test 2\n");
+ if (verbose) fprintf(pysam_stdout, "END test 2\n");
// test
- if (verbose) printf("BEGIN test 3\n");
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 3\n");
// reinit
trans_tbl_t tbl_3;
merged_hdr = init_merged_header();
translate = setup_test_3(merged_hdr);
assert(translate);
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
}
- if (verbose) printf("RUN test 3\n");
- trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, NULL);
+ if (verbose) fprintf(pysam_stdout, "RUN test 3\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
- if (verbose) printf("END RUN test 3\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 3\n");
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
- printf("out\n");
+ fprintf(pysam_stdout, "out\n");
dump_header(out);
}
if (check_test_3(translate, out, &tbl_3)) {
- if (verbose) printf("Test 3 : PASS\n");
+ if (verbose) fprintf(pysam_stdout, "Test 3 : PASS\n");
++success;
} else {
- if (verbose) printf("Test 3 : FAIL\n");
- fprintf(pysamerr, "Test 3 : FAIL\n");
+ if (verbose) fprintf(pysam_stdout, "Test 3 : FAIL\n");
+ fprintf(pysam_stderr, "Test 3 : FAIL\n");
++failure;
}
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_3);
- if (verbose) printf("END test 3\n");
+ if (verbose) fprintf(pysam_stdout, "END test 3\n");
// test
- if (verbose) printf("BEGIN test 4\n");
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 4\n");
// reinit
trans_tbl_t tbl_4;
merged_hdr = init_merged_header();
translate = setup_test_4(merged_hdr);
assert(translate);
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
}
- if (verbose) printf("RUN test 4\n");
- trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, NULL);
+ if (verbose) fprintf(pysam_stdout, "RUN test 4\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
- if (verbose) printf("END RUN test 4\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 4\n");
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
- printf("out\n");
+ fprintf(pysam_stdout, "out\n");
dump_header(out);
}
if (check_test_4(translate, out, &tbl_4)) {
- if (verbose) printf("Test 4 : PASS\n");
+ if (verbose) fprintf(pysam_stdout, "Test 4 : PASS\n");
++success;
} else {
- if (verbose) printf("Test 4 : FAIL\n");
- fprintf(pysamerr, "Test 4 : FAIL\n");
+ if (verbose) fprintf(pysam_stdout, "Test 4 : FAIL\n");
+ fprintf(pysam_stderr, "Test 4 : FAIL\n");
++failure;
}
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_4);
- if (verbose) printf("END test 4\n");
+ if (verbose) fprintf(pysam_stdout, "END test 4\n");
// test
- if (verbose) printf("BEGIN test 5\n");
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 5\n");
// reinit
trans_tbl_t tbl_5;
merged_hdr = init_merged_header();
assert(translate);
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
}
- if (verbose) printf("RUN test 5\n");
- trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, NULL);
+ if (verbose) fprintf(pysam_stdout, "RUN test 5\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL);
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
- if (verbose) printf("END RUN test 5\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 5\n");
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
- printf("out\n");
+ fprintf(pysam_stdout, "out\n");
dump_header(out);
}
if (check_test_5(translate, out, &tbl_5)) {
- if (verbose) printf("Test 5 : PASS\n");
+ if (verbose) fprintf(pysam_stdout, "Test 5 : PASS\n");
++success;
} else {
- if (verbose) printf("Test 5 : FAIL\n");
- fprintf(pysamerr, "Test 5 : FAIL\n");
+ if (verbose) fprintf(pysam_stdout, "Test 5 : FAIL\n");
+ fprintf(pysam_stderr, "Test 5 : FAIL\n");
++failure;
}
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_5);
- if (verbose) printf("END test 5\n");
+ if (verbose) fprintf(pysam_stdout, "END test 5\n");
// test
- if (verbose) printf("BEGIN test 6\n");
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 6\n");
// reinit
trans_tbl_t tbl_6;
merged_hdr = init_merged_header();
translate = setup_test_6(merged_hdr);
assert(translate);
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
}
- if (verbose) printf("RUN test 6\n");
- trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, "filename");
+ if (verbose) fprintf(pysam_stdout, "RUN test 6\n");
+ trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename");
out = finish_merged_header(merged_hdr);
free_merged_header(merged_hdr);
- if (verbose) printf("END RUN test 6\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 6\n");
if (verbose > 1) {
- printf("translate\n");
+ fprintf(pysam_stdout, "translate\n");
dump_header(translate);
- printf("out\n");
+ fprintf(pysam_stdout, "out\n");
dump_header(out);
}
if (check_test_6(translate, out, &tbl_6)) {
- if (verbose) printf("Test 6 : PASS\n");
+ if (verbose) fprintf(pysam_stdout, "Test 6 : PASS\n");
++success;
} else {
- if (verbose) printf("Test 6 : FAIL\n");
- fprintf(pysamerr, "Test 6 : FAIL\n");
+ if (verbose) fprintf(pysam_stdout, "Test 6 : FAIL\n");
+ fprintf(pysam_stderr, "Test 6 : FAIL\n");
++failure;
}
// teardown
bam_hdr_destroy(translate);
bam_hdr_destroy(out);
trans_tbl_destroy(&tbl_6);
- if (verbose) printf("END test 6\n");
+ if (verbose) fprintf(pysam_stdout, "END test 6\n");
if (success == NUM_TESTS) {
return 0;
} else {
- fprintf(pysamerr, "%d failures %d successes\n", failure, success);
+ fprintf(pysam_stderr, "%d failures %d successes\n", failure, success);
return 1;
}
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <stdlib.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <stdlib.h>
(*hdr_in)->l_text = strlen(test1);
}
-int main(int argc, char**argv)
+int samtools_test_count_rg_main(int argc, char**argv)
{
// test state
const int NUM_TESTS = 1;
++verbose;
break;
default:
- printf(
+ fprintf(pysam_stdout,
"usage: test_count_rg [-v]\n\n"
" -v verbose output\n"
);
}
- // Setup pysamerr redirect
+ // Setup pysam_stderr redirect
kstring_t res = { 0, 0, NULL };
- FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
+ FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr
char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp";
FILE* check = NULL;
// setup
- if (verbose) printf("BEGIN test 1\n"); // TID test
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // TID test
bam_hdr_t* hdr1;
size_t count;
char** output;
setup_test_1(&hdr1);
if (verbose > 1) {
- printf("hdr1\n");
+ fprintf(pysam_stdout, "hdr1\n");
dump_hdr(hdr1);
}
- if (verbose) printf("RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 1\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bool result_1 = count_RG(hdr1, &count, &output);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
if (verbose > 1) {
- printf("b\n");
+ fprintf(pysam_stdout, "b\n");
dump_hdr(hdr1);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 1\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 1\n");
}
fclose(check);
}
free(output);
bam_hdr_destroy(hdr1);
- if (verbose) printf("END test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END test 1\n");
// Cleanup
free(res.s);
remove(tempfname);
if (failure > 0)
- fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
- fclose(orig_pysamerr);
+ fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success);
+ fclose(orig_pysam_stderr);
return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <stdlib.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <stdlib.h>
(*hdr_in)->l_text = strlen(test1);
}
-int main(int argc, char**argv)
+int samtools_test_expand_format_string_main(int argc, char**argv)
{
// test state
const int NUM_TESTS = 1;
++verbose;
break;
default:
- printf(
+ fprintf(pysam_stdout,
"usage: test_expand_format_string [-v]\n\n"
" -v verbose output\n"
);
}
- // Setup pysamerr redirect
+ // Setup pysam_stderr redirect
kstring_t res = { 0, 0, NULL };
- FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
+ FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr
char* tempfname = (optind < argc)? argv[optind] : "test_expand_format_string.tmp";
FILE* check = NULL;
// setup
- if (verbose) printf("BEGIN test 1\n"); // default format string test
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // default format string test
const char* format_string_1 = "%*_%#.bam";
const char* basename_1 = "basename";
const char* rg_id_1 = "1#2.3";
const int rg_idx_1 = 4;
if (verbose > 1) {
- printf("format_string:%s\n"
+ fprintf(pysam_stdout, "format_string:%s\n"
"basename:%s\n"
"rg_id:%s\n"
"rg_idx:%d\n", format_string_1, basename_1, rg_id_1, rg_idx_1);
}
- if (verbose) printf("RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 1\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
char* output_1 = expand_format_string(format_string_1, basename_1, rg_id_1, rg_idx_1, NULL);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
if (verbose > 1) {
- printf("format_string:%s\n"
+ fprintf(pysam_stdout, "format_string:%s\n"
"basename:%s\n"
"rg_id:%s\n"
"rg_idx:%d\n", format_string_1, basename_1, rg_id_1, rg_idx_1);
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 1\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 1\n");
}
fclose(check);
// teardown
free(output_1);
- if (verbose) printf("END test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END test 1\n");
// Cleanup test harness
free(res.s);
remove(tempfname);
if (failure > 0)
- fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
- fclose(orig_pysamerr);
+ fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success);
+ fclose(orig_pysam_stderr);
return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <unistd.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <unistd.h>
return true;
}
-int main(int argc, char**argv)
+int samtools_test_filter_header_rg_main(int argc, char**argv)
{
// test state
const int NUM_TESTS = 2;
++verbose;
break;
default:
- printf(
+ fprintf(pysam_stdout,
"usage: test_filter_header_rg [-v]\n\n"
" -v verbose output\n"
);
}
- // Setup pysamerr redirect
+ // Setup pysam_stderr redirect
kstring_t res = { 0, 0, NULL };
- FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
+ FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr
char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp";
FILE* check = NULL;
// setup
- if (verbose) printf("BEGIN test 1\n"); // test eliminating a tag that isn't there
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // test eliminating a tag that isn't there
bam_hdr_t* hdr1;
const char* id_to_keep_1 = "1#2.3";
setup_test_1(&hdr1);
if (verbose > 1) {
- printf("hdr1\n");
+ fprintf(pysam_stdout, "hdr1\n");
dump_hdr(hdr1);
}
- if (verbose) printf("RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 1\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bool result_1 = filter_header_rg(hdr1, id_to_keep_1);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
if (verbose > 1) {
- printf("hdr1\n");
+ fprintf(pysam_stdout, "hdr1\n");
dump_hdr(hdr1);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 1\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 1\n");
}
fclose(check);
// teardown
bam_hdr_destroy(hdr1);
- if (verbose) printf("END test 1\n");
+ if (verbose) fprintf(pysam_stdout, "END test 1\n");
- if (verbose) printf("BEGIN test 2\n"); // test eliminating a tag that is there
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there
bam_hdr_t* hdr2;
const char* id_to_keep_2 = "fish";
setup_test_2(&hdr2);
if (verbose > 1) {
- printf("hdr2\n");
+ fprintf(pysam_stdout, "hdr2\n");
dump_hdr(hdr2);
}
- if (verbose) printf("RUN test 2\n");
+ if (verbose) fprintf(pysam_stdout, "RUN test 2\n");
// test
- xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
bool result_2 = filter_header_rg(hdr2, id_to_keep_2);
- fclose(pysamerr);
+ fclose(pysam_stderr);
- if (verbose) printf("END RUN test 2\n");
+ if (verbose) fprintf(pysam_stdout, "END RUN test 2\n");
if (verbose > 1) {
- printf("hdr2\n");
+ fprintf(pysam_stdout, "hdr2\n");
dump_hdr(hdr2);
}
++success;
} else {
++failure;
- if (verbose) printf("FAIL test 2\n");
+ if (verbose) fprintf(pysam_stdout, "FAIL test 2\n");
}
fclose(check);
// teardown
bam_hdr_destroy(hdr2);
- if (verbose) printf("END test 2\n");
+ if (verbose) fprintf(pysam_stdout, "END test 2\n");
// Cleanup
free(res.s);
remove(tempfname);
if (failure > 0)
- fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
- fclose(orig_pysamerr);
+ fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success);
+ fclose(orig_pysam_stderr);
return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <stdlib.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_split.c"
#include "../test.h"
#include <stdlib.h>
return true;
}
-int main(int argc, char**argv)
+int samtools_test_parse_args_main(int argc, char**argv)
{
// test state
const int NUM_TESTS = 2;
++verbose;
break;
default:
- printf(
+ fprintf(pysam_stdout,
"usage: test_parse_args [-v]\n\n"
" -v verbose output\n"
);
}
}
- // Setup stdout and pysamerr redirect
- kstring_t res_stdout = { 0, 0, NULL };
- kstring_t res_pysamerr = { 0, 0, NULL };
- FILE* orig_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save pysamerr
- FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr
- char* tempfname_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o";
- char* tempfname_pysamerr = (optind < argc)? argv[optind] : "test_parse_args.tmp.e";
- FILE* check_stdout = NULL;
- FILE* check_pysamerr = NULL;
+ // Setup pysam_stdout and pysam_stderr redirect
+ kstring_t res_pysam_stdout = { 0, 0, NULL };
+ kstring_t res_pysam_stderr = { 0, 0, NULL };
+ FILE* orig_pysam_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save a duplicate of the stdout descriptor
+ FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr
+ char* tempfname_pysam_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o";
+ char* tempfname_pysam_stderr = (optind < argc)? argv[optind] : "test_parse_args.tmp.e";
+ FILE* check_pysam_stdout = NULL;
+ FILE* check_pysam_stderr = NULL;
// Cleanup getopt
optind = 1;
// setup
- if (verbose) fprintf(orig_stdout,"BEGIN test 1\n"); // test eliminating a tag that isn't there
+ if (verbose) fprintf(orig_pysam_stdout,"BEGIN test 1\n"); // test eliminating a tag that isn't there
int argc_1;
char** argv_1;
setup_test_1(&argc_1, &argv_1);
if (verbose > 1) {
- fprintf(orig_stdout, "argc: %d\n", argc_1);
+ fprintf(orig_pysam_stdout, "argc: %d\n", argc_1);
}
- if (verbose) fprintf(orig_stdout,"RUN test 1\n");
+ if (verbose) fprintf(orig_pysam_stdout,"RUN test 1\n");
// test
- xfreopen(tempfname_stdout, "w", stdout); // Redirect stdout to pipe
- xfreopen(tempfname_pysamerr, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname_pysam_stdout, "w", pysam_stdout); // Redirect pysam_stdout to pipe
+ xfreopen(tempfname_pysam_stderr, "w", pysam_stderr); // Redirect pysam_stderr to pipe
parsed_opts_t* result_1 = parse_args(argc_1, argv_1);
- fclose(stdout);
- fclose(pysamerr);
+ fclose(pysam_stdout);
+ fclose(pysam_stderr);
- if (verbose) fprintf(orig_stdout, "END RUN test 1\n");
+ if (verbose) fprintf(orig_pysam_stdout, "END RUN test 1\n");
if (verbose > 1) {
- fprintf(orig_stdout, "argc: %d\n", argc_1);
+ fprintf(orig_pysam_stdout, "argc: %d\n", argc_1);
}
// check result
- res_stdout.l = res_pysamerr.l = 0;
- check_stdout = fopen(tempfname_stdout, "r");
- check_pysamerr = fopen(tempfname_pysamerr, "r");
+ res_pysam_stdout.l = res_pysam_stderr.l = 0;
+ check_pysam_stdout = fopen(tempfname_pysam_stdout, "r");
+ check_pysam_stderr = fopen(tempfname_pysam_stderr, "r");
if ( !result_1
- && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) >= 0
- && !feof(check_stdout)
- && res_stdout.l > 0
- && kgetline(&res_pysamerr, (kgets_func *)fgets, check_pysamerr) < 0
- && (feof(check_pysamerr) || res_pysamerr.l == 0)) {
+ && kgetline(&res_pysam_stdout, (kgets_func *)fgets, check_pysam_stdout) >= 0
+ && !feof(check_pysam_stdout)
+ && res_pysam_stdout.l > 0
+ && kgetline(&res_pysam_stderr, (kgets_func *)fgets, check_pysam_stderr) < 0
+ && (feof(check_pysam_stderr) || res_pysam_stderr.l == 0)) {
++success;
} else {
++failure;
- if (verbose) fprintf(orig_stdout, "FAIL test 1\n");
+ if (verbose) fprintf(orig_pysam_stdout, "FAIL test 1\n");
}
- fclose(check_pysamerr);
- fclose(check_stdout);
+ fclose(check_pysam_stderr);
+ fclose(check_pysam_stdout);
// teardown
cleanup_opts(result_1);
free(argv_1[i]);
}
free(argv_1);
- if (verbose) fprintf(orig_stdout, "END test 1\n");
+ if (verbose) fprintf(orig_pysam_stdout, "END test 1\n");
// Cleanup getopt
optind = 1;
- if (verbose) fprintf(orig_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there
+ if (verbose) fprintf(orig_pysam_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there
int argc_2;
char** argv_2;
setup_test_2(&argc_2, &argv_2);
if (verbose > 1) {
- fprintf(orig_stdout, "argc: %d\n", argc_2);
+ fprintf(orig_pysam_stdout, "argc: %d\n", argc_2);
}
- if (verbose) fprintf(orig_stdout, "RUN test 2\n");
+ if (verbose) fprintf(orig_pysam_stdout, "RUN test 2\n");
// test
- xfreopen(tempfname_stdout, "w", stdout); // Redirect stdout to pipe
- xfreopen(tempfname_pysamerr, "w", pysamerr); // Redirect pysamerr to pipe
+ xfreopen(tempfname_pysam_stdout, "w", pysam_stdout); // Redirect pysam_stdout to pipe
+ xfreopen(tempfname_pysam_stderr, "w", pysam_stderr); // Redirect pysam_stderr to pipe
parsed_opts_t* result_2 = parse_args(argc_2, argv_2);
- fclose(stdout);
- fclose(pysamerr);
+ fclose(pysam_stdout);
+ fclose(pysam_stderr);
- if (verbose) fprintf(orig_stdout, "END RUN test 2\n");
+ if (verbose) fprintf(orig_pysam_stdout, "END RUN test 2\n");
if (verbose > 1) {
- fprintf(orig_stdout, "argc: %d\n", argc_2);
+ fprintf(orig_pysam_stdout, "argc: %d\n", argc_2);
}
// check result
- res_stdout.l = res_pysamerr.l = 0;
- check_stdout = fopen(tempfname_stdout, "r");
- check_pysamerr = fopen(tempfname_pysamerr, "r");
+ res_pysam_stdout.l = res_pysam_stderr.l = 0;
+ check_pysam_stdout = fopen(tempfname_pysam_stdout, "r");
+ check_pysam_stderr = fopen(tempfname_pysam_stderr, "r");
if ( result_2
&& check_test_2(result_2)
- && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) < 0
- && (feof(check_stdout) || res_stdout.l == 0)
- && kgetline(&res_pysamerr, (kgets_func *)fgets, check_pysamerr) < 0
- && (feof(check_pysamerr) || res_pysamerr.l == 0)) {
+ && kgetline(&res_pysam_stdout, (kgets_func *)fgets, check_pysam_stdout) < 0
+ && (feof(check_pysam_stdout) || res_pysam_stdout.l == 0)
+ && kgetline(&res_pysam_stderr, (kgets_func *)fgets, check_pysam_stderr) < 0
+ && (feof(check_pysam_stderr) || res_pysam_stderr.l == 0)) {
++success;
} else {
++failure;
- if (verbose) fprintf(orig_stdout, "FAIL test 2\n");
+ if (verbose) fprintf(orig_pysam_stdout, "FAIL test 2\n");
}
- fclose(check_stdout);
- fclose(check_pysamerr);
+ fclose(check_pysam_stdout);
+ fclose(check_pysam_stderr);
// teardown
cleanup_opts(result_2);
}
free(argv_2);
- if (verbose) fprintf(orig_stdout, "END test 2\n");
+ if (verbose) fprintf(orig_pysam_stdout, "END test 2\n");
// Cleanup
- free(res_stdout.s);
- free(res_pysamerr.s);
- remove(tempfname_stdout);
- remove(tempfname_pysamerr);
- fclose(orig_stdout);
+ free(res_pysam_stdout.s);
+ free(res_pysam_stderr.s);
+ remove(tempfname_pysam_stdout);
+ remove(tempfname_pysam_stderr);
+ fclose(orig_pysam_stdout);
if (failure > 0)
- fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success);
- fclose(orig_pysamerr);
+ fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success);
+ fclose(orig_pysam_stderr);
return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE;
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
void xfreopen(const char *path, const char *mode, FILE *stream)
{
if (freopen(path, mode, stream) == NULL) {
- fprintf(pysamerr, __FILE__": error reopening %s: %s\n",
+ fprintf(pysam_stderr, __FILE__": error reopening %s: %s\n",
path, strerror(errno));
exit(2);
}
void dump_hdr(const bam_hdr_t* hdr)
{
- printf("n_targets: %d\n", hdr->n_targets);
- printf("ignore_sam_err: %d\n", hdr->ignore_sam_err);
- printf("l_text: %u\n", hdr->l_text);
- printf("idx\ttarget_len\ttarget_name:\n");
+ fprintf(pysam_stdout, "n_targets: %d\n", hdr->n_targets);
+ fprintf(pysam_stdout, "ignore_sam_err: %d\n", hdr->ignore_sam_err);
+ fprintf(pysam_stdout, "l_text: %u\n", hdr->l_text);
+ fprintf(pysam_stdout, "idx\ttarget_len\ttarget_name:\n");
int32_t target;
for (target = 0; target < hdr->n_targets; ++target) {
- printf("%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]);
+ fprintf(pysam_stdout, "%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]);
}
- printf("text: \"%s\"\n", hdr->text);
+ fprintf(pysam_stdout, "text: \"%s\"\n", hdr->text);
}
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_tview.c"
#include <stdbool.h>
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
+#include <config.h>
+
#include "../../bam_tview.c"
#include <stdbool.h>
free(header);
}
-int main(int argc, char** argv)
+int samtools_test_get_rg_sample_main(int argc, char** argv)
{
const int NUM_TESTS = 1;
int success = 0;
if (success == NUM_TESTS) {
return 0;
} else {
- fprintf(pysamerr, "%d failures %d successes\n", failure, success);
+ fprintf(pysam_stderr, "%d failures %d successes\n", failure, success);
return 1;
}
}
-#define SAMTOOLS_VERSION "1.3"
+#define SAMTOOLS_VERSION "1.3.1"
using cython and a high-level API for convenient access to the data
within standard genomic file formats.
-The current version wraps htslib-1.3, samtools-1.3 and bcftools-1.3.
+The current version wraps htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1.
See:
http://www.htslib.org
outf.write(
"/* empty config.h created by pysam */\n")
outf.write(
- "/* conservative compilation options */")
+ "/* conservative compilation options */\n")
if HTSLIB_LIBRARY_DIR:
# linking against a shared, externally installed htslib version, no
"adding shared libcurl and libcrypto")
external_htslib_libraries.extend(["curl", "crypto"])
+# create empty config.h files if they have not been created automatically
+# or created by the user:
+for fn in "samtools/config.h", "htslib/config.h":
+ if not os.path.exists(fn):
+ with open(fn, "w") as outf:
+ outf.write(
+ "/* empty config.h created by pysam */\n")
+ outf.write(
+ "/* conservative compilation options */\n")
+
parts = ["samtools",
"bcftools",
"htslib",
"vcf",
"bcf"]
-# remove existing files to recompute
-# necessary to be both compatible for python 2.7 and 3.3
-if IS_PYTHON3:
- for part in parts:
- try:
- os.unlink("pysam/c%s.c" % part)
- except:
- pass
-
# Exit if there are no pre-compiled files and no cython available
fn = source_pattern % "htslib"
if not os.path.exists(fn):
"pysam.ctabixproxies",
[source_pattern % "tabixproxies"] +
os_c_files,
- library_dirs=[],
+ library_dirs=htslib_library_dirs,
include_dirs=include_os,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
"pysam.cvcf",
[source_pattern % "vcf"] +
os_c_files,
- library_dirs=[],
+ library_dirs=htslib_library_dirs,
include_dirs=["htslib", "."] + include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
import unittest
import collections
import copy
+import array
from TestUtils import checkFieldEqual
(None, 25, 'T'), (None, 26, 'T'),
(5, 27, 'A'), (6, 28, 'A'), (7, 29, 'A'), (8, 30, 'A')]
)
-
+
a.cigarstring = "5M2D2I2M"
a.set_tag("MD", "4C^TT2")
self.assertEqual(
(7, 27, 'A'), (8, 28, 'A')]
)
+ def test_get_aligned_pairs_skip_reference(self):
+ a = self.buildRead()
+ a.query_sequence = "A" * 10
+ a.cigarstring = "5M1N5M"
+ a.set_tag("MD", "10")
+
+ self.assertEqual(
+ a.get_aligned_pairs(with_seq=True),
+ [(0, 20, 'A'), (1, 21, 'A'), (2, 22, 'A'),
+ (3, 23, 'A'), (4, 24, 'A'), (None, 25, None),
+ (5, 26, 'A'), (6, 27, 'A'), (7, 28, 'A'),
+ (8, 29, 'A'), (9, 30, 'A')])
+
+ self.assertEqual(
+ a.get_aligned_pairs(with_seq=False),
+ [(0, 20), (1, 21), (2, 22),
+ (3, 23), (4, 24), (None, 25),
+ (5, 26), (6, 27), (7, 28),
+ (8, 29), (9, 30)])
+
+ self.assertEqual(
+ a.get_aligned_pairs(matches_only=True, with_seq=False),
+ [(0, 20), (1, 21),
+ (2, 22), (3, 23),
+ (4, 24), (5, 26),
+ (6, 27), (7, 28),
+ (8, 29), (9, 30)])
+
def testNoSequence(self):
'''issue 176: retrieving length without query sequence
with soft-clipping.
self.assertEqual(a.query_alignment_length, 20)
+class TestCigarStats(ReadTest):
+
+ def testStats(self):
+
+ a = self.buildRead()
+
+ a.cigarstring = None
+ self.assertEqual(
+ [list(x) for x in a.get_cigar_stats()],
+ [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
+
+ a.cigarstring = "10M"
+ self.assertEqual(
+ [list(x) for x in a.get_cigar_stats()],
+ [[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
+
+ a.cigarstring = "10M2I2M"
+ self.assertEqual(
+ [list(x) for x in a.get_cigar_stats()],
+ [[12, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
+
+ for i, x in enumerate("MIDNSHP=X"):
+ a.cigarstring = "2{}".format(x)
+ expected = [[0] * 11, [0] * 11]
+ expected[0][i] = 2
+ expected[1][i] = 1
+ self.assertEqual(
+ [list(x) for x in a.get_cigar_stats()],
+ expected)
+
+ a.cigarstring = "10M"
+ a.set_tag("NM", 5)
+ self.assertEqual(
+ [list(x) for x in a.get_cigar_stats()],
+ [[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
+ [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
+
+ a.cigarstring = None
+ self.assertEqual(
+ [list(x) for x in a.get_cigar_stats()],
+ [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
+
+
class TestAlignedPairs(unittest.TestCase):
filename = os.path.join(DATADIR, "example_aligned_pairs.bam")
def testReferenceBases(self):
"""reference bases should always be the same nucleotide
"""
- reference_bases = collections.defaultdict(list)
+ reference_bases = collections.defaultdict(list)
with pysam.AlignmentFile(self.filename) as inf:
for c in inf.pileup():
for r in c.pileups:
self.assertEqual(False, a.has_tag("NM"))
# check if deleting a non-existing tag is fine
a.set_tag("NM", None)
+ a.set_tag("NM", None)
+ def testArrayTags(self):
+ read = self.buildRead()
+ supported_dtypes = "bhBHf"
+ unsupported_dtypes = "lLd"
+
+ for dtype in supported_dtypes:
+ key = "F" + dtype
+ read.set_tag(key, array.array(dtype, range(10)))
+ ary = read.get_tag(key)
+
+ for dtype in unsupported_dtypes:
+ key = "F" + dtype
+ self.assertRaises(ValueError,
+ read.set_tag,
+ key,
+ array.array(dtype, range(10)))
+
def testAddTagsType(self):
a = self.buildRead()
a.tags = None
"A" * 5 + "C" * 3 + "A" * 5,
a.get_reference_sequence())
+ def testMDTagRefSkipping(self):
+ a = self.buildRead()
+
+ a.cigarstring = "5M1N5M"
+ a.query_sequence = "A" * 10
+ a.set_tag('MD', "10")
+ self.assertEqual(
+ "A" * 10,
+ a.get_reference_sequence())
+
+ a.cigarstring = "5M3N5M"
+ a.query_sequence = "A" * 10
+ a.set_tag('MD', "10")
+ self.assertEqual(
+ "A" * 10,
+ a.get_reference_sequence())
+
def testMDTagSoftClipping(self):
a = self.buildRead()
self.assertEqual(
"A" * 5 + "C" + "A" * 5,
a.get_reference_sequence())
-
+
# all together
a.cigarstring = "5S5M1D5M1I5M5S"
a.query_sequence = "G" * 5 + "A" * 16 + "G" * 5
self.assertEqual(
"AAcAATCAAAAA",
a.get_reference_sequence())
-
+
a.cigarstring = "5S5M2D1I5M5S"
a.query_sequence = "G" * 5 + "A" * 11 + "G" * 5
a.set_tag('MD', "2C2^TC5")
class TestCopy(ReadTest):
-
+
def testCopy(self):
a = self.buildRead()
b = copy.copy(a)
import pysam
import pysam.samtools
from TestUtils import checkBinaryEqual, checkURL, \
- checkSamtoolsViewEqual, checkFieldEqual, force_str
+ check_samtools_view_equal, checkFieldEqual, force_str
DATADIR = "pysam_data"
"rb")
self.reads = list(self.samfile.fetch())
+ def tearDown(self):
+ self.samfile.close()
+
def testARqname(self):
self.assertEqual(
self.reads[0].query_name,
self.assertEqual(self.reads[0].opt("XT"), "U")
self.assertEqual(self.reads[1].opt("XT"), "R")
- def tearDown(self):
- self.samfile.close()
-
class BasicTestSAMFromFetch(BasicTestBAMFromFetch):
The *checkf* is used to determine if the files are
equal.
'''
- infile = pysam.AlignmentFile(
- os.path.join(DATADIR, input_filename),
- input_mode)
-
- if "b" in input_mode:
- self.assertTrue(infile.is_bam)
- self.assertFalse(infile.is_cram)
- elif "c" in input_mode:
- self.assertFalse(infile.is_bam)
- self.assertTrue(infile.is_cram)
- else:
- self.assertFalse(infile.is_cram)
- self.assertFalse(infile.is_bam)
-
- if use_template:
- outfile = pysam.AlignmentFile(
- output_filename,
- output_mode,
- reference_filename=sequence_filename,
- template=infile)
- else:
- outfile = pysam.AlignmentFile(
- output_filename,
- output_mode,
- reference_names=infile.references,
- reference_lengths=infile.lengths,
- reference_filename=sequence_filename,
- add_sq_text=False)
- iter = infile.fetch()
+ with pysam.AlignmentFile(
+ os.path.join(DATADIR, input_filename),
+ input_mode) as infile:
+
+ if "b" in input_mode:
+ self.assertTrue(infile.is_bam)
+ self.assertFalse(infile.is_cram)
+ elif "c" in input_mode:
+ self.assertFalse(infile.is_bam)
+ self.assertTrue(infile.is_cram)
+ else:
+ self.assertFalse(infile.is_cram)
+ self.assertFalse(infile.is_bam)
+
+ if use_template:
+ outfile = pysam.AlignmentFile(
+ output_filename,
+ output_mode,
+ reference_filename=sequence_filename,
+ template=infile)
+ else:
+ outfile = pysam.AlignmentFile(
+ output_filename,
+ output_mode,
+ reference_names=infile.references,
+ reference_lengths=infile.lengths,
+ reference_filename=sequence_filename,
+ add_sq_text=False)
- for x in iter:
- outfile.write(x)
+ iter = infile.fetch()
- infile.close()
- outfile.close()
+ for x in iter:
+ outfile.write(x)
+
+ outfile.close()
self.assertTrue(checkf(
os.path.join(DATADIR, reference_filename),
"tmp_ex2.cram",
"rc", "wc",
sequence_filename="pysam_data/ex1.fa",
- checkf=checkSamtoolsViewEqual)
+ checkf=check_samtools_view_equal)
def testSAM2BAM(self):
self.checkEcho("ex2.sam",
"rb", "wc",
sequence_filename="pysam_data/ex1.fa",
checkf=partial(
- checkSamtoolsViewEqual,
+ check_samtools_view_equal,
without_header=True))
def testCRAM2BAM(self):
"rc", "wb",
sequence_filename="pysam_data/ex1.fa",
checkf=partial(
- checkSamtoolsViewEqual,
+ check_samtools_view_equal,
without_header=True))
def testSAM2CRAM(self):
"r", "wc",
sequence_filename="pysam_data/ex1.fa",
checkf=partial(
- checkSamtoolsViewEqual,
+ check_samtools_view_equal,
without_header=True))
def testCRAM2SAM(self):
"rc", "wh",
sequence_filename="pysam_data/ex1.fa",
checkf=partial(
- checkSamtoolsViewEqual,
+ check_samtools_view_equal,
without_header=True))
# Disabled - should work, files are not binary equal, but are
filename = os.path.join(DATADIR, "ex2.bam")
mode = "rb"
+ reference_filename = None
def setUp(self):
self.samfile = pysam.AlignmentFile(
- self.filename, self.mode,
+ self.filename,
+ self.mode,
+ reference_filename=self.reference_filename,
)
+ def tearDown(self):
+ self.samfile.close()
+
def checkRange(self, rnge):
'''compare results from iterator with those from samtools.'''
ps = list(self.samfile.fetch(region=rnge))
self.checkRange("%s:%i-%i" %
(contig, start, start + 90))
- def tearDown(self):
- self.samfile.close()
-
class TestIteratorRowAllBAM(unittest.TestCase):
mode = "rc"
-class TestIteratorRowCRAM(TestIteratorRowBAM):
- filename = os.path.join(DATADIR, "ex2.cram")
- mode = "rc"
+class TestIteratorRowCRAMWithReferenceFilename(TestIteratorRowCRAM):
+ reference_filename = os.path.join(DATADIR, "ex1.fa")
+
##########################################################
##########################################################
filename = os.path.join(DATADIR, 'example_btag.bam')
-class TestDoubleFetch(unittest.TestCase):
-
+class TestDoubleFetchBAM(unittest.TestCase):
'''check if two iterators on the same bamfile are independent.'''
filename = os.path.join(DATADIR, 'ex1.bam')
+ mode = "rb"
def testDoubleFetch(self):
- samfile1 = pysam.AlignmentFile(self.filename, 'rb')
-
- for a, b in zip(samfile1.fetch(multiple_iterators=True),
- samfile1.fetch(multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
+ with pysam.AlignmentFile(self.filename, self.mode) as samfile1:
+ for a, b in zip(samfile1.fetch(multiple_iterators=True),
+ samfile1.fetch(multiple_iterators=True)):
+ self.assertEqual(a.compare(b), 0)
def testDoubleFetchWithRegion(self):
- samfile1 = pysam.AlignmentFile(self.filename, 'rb')
- chr, start, stop = 'chr1', 200, 3000000
- # just making sure the test has something to catch
- self.assertTrue(len(list(samfile1.fetch(chr, start, stop))) > 0)
+ with pysam.AlignmentFile(self.filename, self.mode) as samfile1:
+ contig, start, stop = 'chr1', 200, 3000000
+ # just making sure the test has something to catch
+ self.assertTrue(len(list(samfile1.fetch(contig, start, stop))) > 0)
- for a, b in zip(samfile1.fetch(chr, start, stop),
- samfile1.fetch(chr, start, stop,
- multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
+ # see Issue #293
+ # The following fails for CRAM files, but works for BAM
+ # files when the first is multiple_iterators=False:
+ for a, b in zip(samfile1.fetch(contig, start, stop,
+ multiple_iterators=True),
+ samfile1.fetch(contig, start, stop,
+ multiple_iterators=True)):
+ self.assertEqual(a.compare(b), 0)
def testDoubleFetchUntilEOF(self):
- samfile1 = pysam.AlignmentFile(self.filename, 'rb')
+ with pysam.AlignmentFile(self.filename, self.mode) as samfile1:
+
+ for a, b in zip(samfile1.fetch(until_eof=True),
+ samfile1.fetch(until_eof=True,
+ multiple_iterators=True)):
+ self.assertEqual(a.compare(b), 0)
+
+
+class TestDoubleFetchCRAM(TestDoubleFetchBAM):
+ filename = os.path.join(DATADIR, 'ex2.cram')
+ mode = "rc"
+
- for a, b in zip(samfile1.fetch(until_eof=True),
- samfile1.fetch(until_eof=True,
- multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
+class TestDoubleFetchCRAMWithReference(TestDoubleFetchBAM):
+ filename = os.path.join(DATADIR, 'ex2.cram')
+ mode = "rc"
+ reference_filename = os.path.join(DATADIR, 'ex1.fa')
class TestRemoteFileFTP(unittest.TestCase):
if not checkURL(self.url):
return
- samfile = pysam.AlignmentFile(self.url, "rb")
- result = list(samfile.fetch(region=self.region))
- samfile_local = pysam.AlignmentFile(self.local, "rb")
- ref = list(samfile_local.fetch(region=self.region))
+ with pysam.AlignmentFile(self.url, "rb") as samfile:
+ result = list(samfile.fetch(region=self.region))
+
+ with pysam.AlignmentFile(self.local, "rb") as samfile_local:
+ ref = list(samfile_local.fetch(region=self.region))
self.assertEqual(len(ref), len(result))
for x, y in zip(result, ref):
if not checkURL(self.url):
return
- samfile = pysam.AlignmentFile(self.url, "rb")
- result = list(samfile.fetch())
- samfile_local = pysam.AlignmentFile(self.local, "rb")
- ref = list(samfile_local.fetch())
+ with pysam.AlignmentFile(self.url, "rb") as samfile:
+ result = list(samfile.fetch())
+
+ with pysam.AlignmentFile(self.local, "rb") as samfile_local:
+ ref = list(samfile_local.fetch())
self.assertEqual(len(ref), len(result))
for x, y in zip(result, ref):
self.samfile = pysam.AlignmentFile(self.samfilename)
self.fastafile = pysam.Fastafile(self.fastafilename)
+ def tearDown(self):
+ self.samfile.close()
+ self.fastafile.close()
+
def checkEqual(self, references, iterator):
for x, column in enumerate(iterator):
samfile.close()
pysam.samtools.index("test_count_coverage_read_all.bam")
+ def tearDown(self):
+ self.samfile.close()
+ self.fastafile.close()
+
def count_coverage_python(self, bam, chrom, start, stop,
read_callback,
quality_threshold=15):
self.assertEqual(fast_counts[3], manual_counts[3])
def test_count_coverage_read_all(self):
- samfile = pysam.AlignmentFile("test_count_coverage_read_all.bam")
+
chrom = 'chr1'
start = 0
stop = 2000
def filter(read):
return not (read.flag & (0x4 | 0x100 | 0x200 | 0x400))
- fast_counts = samfile.count_coverage(
- chrom, start, stop,
- read_callback='all',
- #read_callback = lambda read: ~(read.flag & (0x4 | 0x100 | 0x200 | 0x400)),
- quality_threshold=0)
- manual_counts = samfile.count_coverage(
- chrom, start, stop,
- read_callback=lambda read: not(
- read.flag & (0x4 | 0x100 | 0x200 | 0x400)),
- quality_threshold=0)
+
+ with pysam.AlignmentFile("test_count_coverage_read_all.bam") as samfile:
+
+ fast_counts = samfile.count_coverage(
+ chrom, start, stop,
+ read_callback='all',
+ #read_callback = lambda read: ~(read.flag & (0x4 | 0x100 | 0x200 | 0x400)),
+ quality_threshold=0)
+ manual_counts = samfile.count_coverage(
+ chrom, start, stop,
+ read_callback=lambda read: not(
+ read.flag & (0x4 | 0x100 | 0x200 | 0x400)),
+ quality_threshold=0)
os.unlink("test_count_coverage_read_all.bam")
os.unlink("test_count_coverage_read_all.bam.bai")
samfile.write(read)
samfile.close()
pysam.samtools.index("test_count_coverage_nofilter.bam")
- samfile = pysam.AlignmentFile("test_count_coverage_nofilter.bam")
chr = 'chr1'
start = 0
stop = 2000
- fast_counts = samfile.count_coverage(chr, start, stop,
- read_callback='nofilter',
- quality_threshold=0)
- manual_counts = self.count_coverage_python(samfile, chr, start, stop,
- read_callback=lambda x: True,
- quality_threshold=0)
- samfile.close()
+ with pysam.AlignmentFile("test_count_coverage_nofilter.bam") as samfile:
+
+ fast_counts = samfile.count_coverage(chr, start, stop,
+ read_callback='nofilter',
+ quality_threshold=0)
+
+ manual_counts = self.count_coverage_python(samfile, chr, start, stop,
+ read_callback=lambda x: True,
+ quality_threshold=0)
+
os.unlink("test_count_coverage_nofilter.bam")
os.unlink("test_count_coverage_nofilter.bam.bai")
self.assertEqual(fast_counts[0], manual_counts[0])
class TestPileupQueryPosition(unittest.TestCase):
-
+
filename = "test_query_position.bam"
def testPileup(self):
log_hand.setFormatter(formatter)
logger.addHandler(log_hand)
- bam = pysam.AlignmentFile(bamfile, 'rb')
- cols = bam.pileup()
+ with pysam.AlignmentFile(bamfile, 'rb') as bam:
+ cols = bam.pileup()
self.assertTrue(True)
def testFail1(self):
def testCount(self):
- samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ with pysam.AlignmentFile(
+ os.path.join(DATADIR, "ex1.bam"),
+ "rb") as samfile:
- for contig in ("chr1", "chr2"):
- for start in range(0, 2000, 100):
- end = start + 1
- self.assertEqual(
- len(list(samfile.fetch(contig, start, end))),
- samfile.count(contig, start, end),
- 'number mismatch for %s:%i-%i %i != %i' % (
- contig, start, end,
+ for contig in ("chr1", "chr2"):
+ for start in range(0, 2000, 100):
+ end = start + 1
+ self.assertEqual(
len(list(samfile.fetch(contig, start, end))),
- samfile.count(contig, start, end)))
+ samfile.count(contig, start, end),
+ 'number mismatch for %s:%i-%i %i != %i' % (
+ contig, start, end,
+ len(list(samfile.fetch(contig, start, end))),
+ samfile.count(contig, start, end)))
- # test empty intervals
- self.assertEqual(
- len(list(samfile.fetch(contig, start, start))),
- samfile.count(contig, start, start),
- 'number mismatch for %s:%i-%i %i != %i' % (
- contig, start, start,
+ # test empty intervals
+ self.assertEqual(
len(list(samfile.fetch(contig, start, start))),
- samfile.count(contig, start, start)))
+ samfile.count(contig, start, start),
+ 'number mismatch for %s:%i-%i %i != %i' % (
+ contig, start, start,
+ len(list(samfile.fetch(contig, start, start))),
+ samfile.count(contig, start, start)))
- # test half empty intervals
- self.assertEqual(len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start))
+ # test half empty intervals
+ self.assertEqual(len(list(samfile.fetch(contig, start))),
+ samfile.count(contig, start))
- self.assertEqual(
- len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start),
- 'number mismatch for %s:%i %i != %i' % (
- contig, start,
+ self.assertEqual(
len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start)))
+ samfile.count(contig, start),
+ 'number mismatch for %s:%i %i != %i' % (
+ contig, start,
+ len(list(samfile.fetch(contig, start))),
+ samfile.count(contig, start)))
def testMate(self):
'''test mate access.'''
for x in readnames:
counts[x] += 1
- samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+ "rb") as samfile:
- for read in samfile.fetch():
- if not read.is_paired:
- self.assertRaises(ValueError, samfile.mate, read)
- elif read.mate_is_unmapped:
- self.assertRaises(ValueError, samfile.mate, read)
- else:
- if counts[read.query_name] == 1:
+ for read in samfile.fetch():
+ if not read.is_paired:
+ self.assertRaises(ValueError, samfile.mate, read)
+ elif read.mate_is_unmapped:
self.assertRaises(ValueError, samfile.mate, read)
else:
- mate = samfile.mate(read)
- self.assertEqual(read.query_name, mate.query_name)
- self.assertEqual(read.is_read1, mate.is_read2)
- self.assertEqual(read.is_read2, mate.is_read1)
- self.assertEqual(
- read.reference_start, mate.next_reference_start)
- self.assertEqual(
- read.next_reference_start, mate.reference_start)
+ if counts[read.query_name] == 1:
+ self.assertRaises(ValueError, samfile.mate, read)
+ else:
+ mate = samfile.mate(read)
+ self.assertEqual(read.query_name, mate.query_name)
+ self.assertEqual(read.is_read1, mate.is_read2)
+ self.assertEqual(read.is_read2, mate.is_read1)
+ self.assertEqual(
+ read.reference_start, mate.next_reference_start)
+ self.assertEqual(
+ read.next_reference_start, mate.reference_start)
def testIndexStats(self):
'''test if total number of mapped/unmapped reads is correct.'''
- samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- self.assertEqual(samfile.mapped, 3235)
- self.assertEqual(samfile.unmapped, 35)
- self.assertEqual(samfile.nocoordinate, 0)
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+ "rb") as samfile:
+ self.assertEqual(samfile.mapped, 3235)
+ self.assertEqual(samfile.unmapped, 35)
+ self.assertEqual(samfile.nocoordinate, 0)
class TestMappedUnmapped(unittest.TestCase):
class TestExplicitIndex(unittest.TestCase):
def testExplicitIndexBAM(self):
- samfile = pysam.AlignmentFile(
- os.path.join(DATADIR, "explicit_index.bam"),
- "rb",
- filepath_index=os.path.join(DATADIR, 'ex1.bam.bai'))
-
- samfile.fetch("chr1")
+ with pysam.AlignmentFile(
+ os.path.join(DATADIR, "explicit_index.bam"),
+ "rb",
+ filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) as samfile:
+ samfile.fetch("chr1")
def testExplicitIndexCRAM(self):
- samfile = pysam.AlignmentFile(
- os.path.join(DATADIR, "explicit_index.cram"),
- "rc",
- filepath_index=os.path.join(DATADIR, 'ex1.cram.crai'))
+ with pysam.AlignmentFile(
+ os.path.join(DATADIR, "explicit_index.cram"),
+ "rc",
+ filepath_index=os.path.join(DATADIR, 'ex1.cram.crai')) as samfile:
+ samfile.fetch("chr1")
def testRemoteExplicitIndexBAM(self):
- samfile = pysam.AlignmentFile(
- "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam",
- "rb",
- filepath_index=os.path.join(DATADIR, 'ex1.bam.bai'))
+ if not checkURL(
+ "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam"):
+ return
- samfile.fetch("chr1")
+ with pysam.AlignmentFile(
+ "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam",
+ "rb",
+ filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) as samfile:
+ samfile.fetch("chr1")
class TestVerbosity(unittest.TestCase):
if IS_PYTHON3:
def force_str(s):
- return s.decode('ascii')
+ try:
+ return s.decode('ascii')
+ except AttributeError:
+ return s
+ def force_bytes(s):
+ try:
+ return s.encode('ascii')
+ except AttributeError:
+ return s
else:
def force_str(s):
return s
+ def force_bytes(s):
+ return s
def openfile(fn):
if fn.endswith(".gz"):
- return gzip.open(fn)
+ try:
+ return gzip.open(fn, "rt", encoding="utf-8")
+ except TypeError:
+ return gzip.open(fn, "r")
else:
return open(fn)
return found
-def checkSamtoolsViewEqual(filename1, filename2,
- without_header=False):
+def check_samtools_view_equal(
+ filename1, filename2,
+ without_header=False):
'''return true if the two files are equal in their
content through samtools view.
'''
(n, getattr(read1, n), getattr(read2, n)))
-def check_lines_equal(cls, a, b, sort=False, filter_f=None):
+def check_lines_equal(cls, a, b, sort=False, filter_f=None, msg=None):
"""check if contents of two files are equal comparing line-wise.
sort: bool
filter_f:
remover lines in both a and b where expression is True
"""
-
aa = openfile(a).readlines()
bb = openfile(b).readlines()
if filter_f is not None:
- aa = [x for x in aa if not filter_f]
- bb = [x for x in bb if not filter_f]
+ aa = [x for x in aa if not filter_f(x)]
+ bb = [x for x in bb if not filter_f(x)]
+
if sort:
- cls.assertEqual(sorted(aa), sorted(bb))
+ cls.assertEqual(sorted(aa), sorted(bb), msg)
else:
- cls.assertEqual(aa, bb)
+ cls.assertEqual(aa, bb, msg)
def get_temp_filename(suffix=""):
import unittest
import pysam
import gzip
+import subprocess
from TestUtils import get_temp_filename, check_lines_equal
DATADIR="cbcf_data"
def read_header(filename):
-
data = []
if filename.endswith(".gz"):
for line in gzip.open(filename):
for line in f:
if line.startswith("#"):
data.append(line)
+
return data
self.assertEqual(len(list(inf.fetch())), 5)
def testDetectBCF(self):
- with pysam.VariantFile(os.path.join(DATADIR,
- "example_vcf40.bcf")) as inf:
+ with pysam.VariantFile(os.path.join(
+ DATADIR,
+ "example_vcf40.bcf")) as inf:
self.assertEqual(inf.category, 'VARIANTS')
self.assertEqual(inf.format, 'BCF')
self.assertEqual(inf.compression, 'BGZF')
check_lines_equal(
self, fn_in, fn_out, sort=True,
- filter_f=lambda x: not x.startswith("##contig"))
+ filter_f=lambda x: x.startswith("##contig"))
os.unlink(fn_out)
def testConstructionWithRecords(self):
filename = "example_vcf42.vcf.gz"
+# Tests that read and reassign fields of the first record of a VCF file.
+class TestSettingRecordValues(unittest.TestCase):
+
+ # VCF file name, joined onto DATADIR by each test.
+ filename = "example_vcf40.vcf"
+
+ def testSetQual(self):
+ # Checks that qual survives self-assignment and accepts a new value.
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ record = next(inf)
+ self.assertEqual(record.qual, 47)
+ # assigning the current value back must leave it unchanged
+ record.qual = record.qual
+ self.assertEqual(record.qual, 47)
+ record.qual = 10
+ self.assertEqual(record.qual, 10)
+ # the new value must also appear in field index 5 of the
+ # tab-separated string form of the record
+ self.assertEqual(str(record).split("\t")[5], "10")
+
+ def testGenotype(self):
+ # Checks the GT value of sample NA00001 in the first record.
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ record = next(inf)
+ sample = record.samples["NA00001"]
+ # NOTE(review): this print looks like leftover debugging output
+ print (sample["GT"])
+ self.assertEqual(sample["GT"], (0, 0))
+# Fails with TypeError
+# sample["GT"] = sample["GT"]
+
+# Smoke test for VariantFile.subset_samples().
+class TestSubsetting(unittest.TestCase):
+
+ # VCF file name, joined onto DATADIR by the test.
+ filename = "example_vcf42.vcf.gz"
+
+ def testSubsetting(self):
+ # Only checks that the call completes; nothing is asserted about
+ # the resulting sample set.
+ with pysam.VariantFile(os.path.join(DATADIR,
+ self.filename)) as inf:
+ inf.subset_samples(["NA00001"])
+
if __name__ == "__main__":
+ # build data files
+ print ("building data files")
+ subprocess.call("make -C %s" % DATADIR, shell=True)
+ print ("starting tests")
unittest.main()
+ print ("completed tests")
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
-M 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+M 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:.
17 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
-20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
-20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.
20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
import unittest
import os
import gzip
+import shutil
from TestUtils import checkURL
self.file.close()
+# Tests opening a FastaFile with and without the filepath_index argument.
+# The active assertions expect NotImplementedError whenever filepath_index
+# is passed; the statements after each early return are unreachable.
+# NOTE(review): the unreachable code is presumably kept for when
+# filepath_index becomes supported - confirm.
+class TestFastaFilePathIndex(unittest.TestCase):
+
+ # FASTA file under test; overridden by the compressed subclass.
+ filename = os.path.join(DATADIR, "ex1.fa")
+
+ def testGarbageIndex(self):
+ # Passing filepath_index must raise NotImplementedError.
+ self.assertRaises(NotImplementedError,
+ pysam.FastaFile,
+ self.filename,
+ filepath_index="garbage.fa.fai")
+ return
+
+ # unreachable: expects ValueError for the same call
+ self.assertRaises(ValueError,
+ pysam.FastaFile,
+ self.filename,
+ filepath_index="garbage.fa.fai")
+
+ def testOpenWithoutIndex(self):
+ # Opening without filepath_index must succeed.
+ faidx = pysam.FastaFile(self.filename)
+ faidx.close()
+
+ def testOpenWithStandardIndex(self):
+ # Passing <filename>.fai as filepath_index must raise
+ # NotImplementedError.
+ self.assertRaises(NotImplementedError,
+ pysam.FastaFile,
+ self.filename,
+ filepath_index=self.filename + ".fai")
+ return
+
+ # unreachable: opens and closes the file with the same index
+ faidx = pysam.FastaFile(self.filename,
+ filepath_index=self.filename + ".fai")
+ faidx.close()
+
+ def testOpenWithOtherIndex(self):
+ # Disabled: the immediate return makes everything below unreachable.
+ return
+ tmpfilename = "tmp_" + os.path.basename(self.filename)
+ shutil.copyfile(self.filename, tmpfilename)
+ faidx = pysam.FastaFile(tmpfilename,
+ filepath_index=self.filename + ".fai")
+ faidx.close()
+ # index should not be auto-generated
+ self.assertFalse(os.path.exists(tmpfilename + ".fai"))
+ os.unlink(tmpfilename)
+
+# Re-runs the inherited TestFastaFilePathIndex tests against ex1.fa.gz.
+class TestFastaFilePathIndexCompressed(TestFastaFilePathIndex):
+
+ filename = os.path.join(DATADIR, "ex1.fa.gz")
+
+
class TestFastxFileFastq(unittest.TestCase):
filetype = pysam.FastxFile
persist=self.persist)
self.has_quality = self.filename.endswith('.fq')
+ def tearDown(self):
+ self.file.close()
+
def checkFirst(self, s):
# test first entry
self.assertEqual(s.sequence, "GGGAACAGGGGGGTGCACTAATGCGCTCCACGCCC")
with gzip.open(fn) as inf:
ref_num = len(list(inf)) / 4
- f = self.filetype(fn)
- l = len(list(f))
+ with self.filetype(fn) as f:
+ l = len(list(f))
self.assertEqual(ref_num, l)
def testFTPView(self):
if not checkURL(self.url):
return
- f = pysam.Fastafile(self.url)
- self.assertEqual(
- len(f.fetch("chr1", 0, 1000)),
- 1000)
+ with pysam.Fastafile(self.url) as f:
+ self.assertEqual(
+ len(f.fetch("chr1", 0, 1000)),
+ 1000)
if __name__ == "__main__":
ex2_truncated.bam \
empty.bam empty.bam.bai \
explicit_index.bam explicit_index.cram \
- faidx_empty_seq.fq.gz
+ faidx_empty_seq.fq.gz \
+ ex1.fa.gz ex1.fa.gz.fai
# ex2.sam - as ex1.sam, but with header
ex2.sam.gz: ex1.bam ex1.bam.bai
%.fq.gz: %.fq
gzip < $< > $@
+
+# Compress a FASTA file with bgzip, reading the prerequisite on stdin and
+# writing the target from stdout.
+%.fa.gz: %.fa
+ bgzip < $< > $@
+
+# Run samtools faidx on the compressed FASTA file to produce its .fai index.
+%.fa.gz.fai: %.fa.gz
+ samtools faidx $<
import sys
import subprocess
import shutil
-from TestUtils import checkBinaryEqual
+from TestUtils import checkBinaryEqual, check_lines_equal, \
+ check_samtools_view_equal, get_temp_filename, force_bytes
IS_PYTHON3 = sys.version_info[0] >= 3
"idxstats ex1.bam > %(out)s_ex1.idxstats",
"fixmate ex1.bam %(out)s_ex1.fixmate.bam",
"flagstat ex1.bam > %(out)s_ex1.flagstat",
+ # Fails on Python 3.3 on Linux; passes on OS X and when
+ # run locally
"calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam",
# use -s option, otherwise the following error in samtools 1.2:
# Samtools-htslib-API: bam_get_library() not yet implemented
samtools_version))
def setUp(self):
- '''setup tests.
+ '''setup tests.
For setup, all commands will be run before the first test is
executed. Individual tests will then just compare the output
os.makedirs(WORKDIR)
for f in self.requisites:
- shutil.copy(os.path.join(DATADIR, f),
+ shutil.copy(os.path.join(DATADIR, f),
os.path.join(WORKDIR, f))
self.savedir = os.getcwd()
output = pysam_method(*pysam_parts,
raw=True,
catch_stdout=True)
-
# sys.stdout.write(" pysam ok\n")
-
if ">" in statement:
with open(pysam_targets[-1], "wb") as outfile:
if output is not None:
- outfile = outfile.write(output)
+ outfile.write(force_bytes(output))
for samtools_target, pysam_target in zip(samtools_targets,
pysam_targets):
else:
samtools_files = [samtools_target]
pysam_files = [pysam_target]
-
+
for s, p in zip(samtools_files, pysam_files):
- self.assertTrue(
- checkBinaryEqual(s, p),
- "%s failed: files %s and %s are not the same" %
- (command, s, p))
+ binary_equal = checkBinaryEqual(s, p)
+ error_msg = "%s failed: files %s and %s are not the same" % (command, s, p)
+ if binary_equal:
+ continue
+ if s.endswith(".bam"):
+ self.assertTrue(
+ check_samtools_view_equal(
+ s, p, without_header=True),
+ error_msg)
+ check_lines_equal(
+ self, s, p,
+ filter_f=lambda x: x.startswith("#"),
+ msg=error_msg)
def testStatements(self):
for statement in self.statements:
+ if (statement.startswith("calmd") and
+ list(sys.version_info[:2]) == [3, 3]):
+ # skip calmd test, fails only on python 3.3.5
+ # in linux (empty output). Works in OsX and passes
+ # for 3.4 and 3.5, see issue #293
+ continue
self.check_statement(statement)
-
+
def tearDown(self):
if os.path.exists(WORKDIR):
shutil.rmtree(WORKDIR)
self.assertRaises(IOError, pysam.samtools.index,
"exdoesntexist.bam")
+# Tests the Python type of the value returned by samtools command wrappers.
+class TestReturnType(unittest.TestCase):
+
+ def testReturnValueString(self):
+ # idxstats must return str (not bytes) on Python 3; on Python 2
+ # the value must be both bytes and basestring.
+ retval = pysam.idxstats(os.path.join(DATADIR, "ex1.bam"))
+ if IS_PYTHON3:
+ self.assertFalse(isinstance(retval, bytes))
+ self.assertTrue(isinstance(retval, str))
+ else:
+ self.assertTrue(isinstance(retval, bytes))
+ self.assertTrue(isinstance(retval, basestring))
+
+ def testReturnValueData(self):
+ # view called with "-O BAM" must return bytes (not str) on
+ # Python 3; on Python 2 the value must be both bytes and basestring.
+ args = "-O BAM {}".format(os.path.join(DATADIR, "ex1.bam")).split(" ")
+ retval = pysam.view(*args)
+
+ if IS_PYTHON3:
+ self.assertTrue(isinstance(retval, bytes))
+ self.assertFalse(isinstance(retval, str))
+ else:
+ self.assertTrue(isinstance(retval, bytes))
+ self.assertTrue(isinstance(retval, basestring))
+
class StdoutTest(unittest.TestCase):
'''test if stdout can be redirected.'''
catch_stdout=False)
self.assertEqual(r, None)
+ def testDoubleCalling(self):
+ # Calls idxstats twice on the same file; the return values are not
+ # inspected, the test only requires that neither call raises.
+ # The following would fail if there is an
+ # issue with stdout being improperly caught.
+ retvals = pysam.idxstats(
+ os.path.join(DATADIR, "ex1.bam"))
+ retvals = pysam.idxstats(
+ os.path.join(DATADIR, "ex1.bam"))
+
+ def testSaveStdout(self):
+ # With save_stdout set, flagstat must return None and the named
+ # file must end up non-empty.
+ # NOTE(review): the temporary file is not removed here - confirm
+ # whether cleanup happens elsewhere.
+ outfile = get_temp_filename(suffix=".tsv")
+ r = pysam.samtools.flagstat(
+ os.path.join(DATADIR, "ex1.bam"),
+ save_stdout=outfile)
+ self.assertEqual(r, None)
+ with open(outfile) as inf:
+ r = inf.read()
+ self.assertTrue(len(r) > 0)
+
class PysamTest(SamtoolsTest):
"""check access to samtools command in the pysam
main package.
-
+
This is for backwards capability.
"""
IterationTest.setUp(self)
self.tabix = pysam.TabixFile(self.filename)
+ def tearDown(self):
+ self.tabix.close()
+
def testRegionStrings(self):
"""test if access with various region strings
works"""
self.tabix.fetch("chr1", 100, 100)
def testGetContigs(self):
- self.assertEqual(sorted(self.tabix.contigs), [b"chr1", b"chr2"])
+ self.assertEqual(sorted(self.tabix.contigs), ["chr1", "chr2"])
# check that contigs is read-only
self.assertRaises(
AttributeError, setattr, self.tabix, "contigs", ["chr1", "chr2"])
# opens any tabix file
with pysam.TabixFile(self.filename) as inf:
pass
-
+
for i in range(1000):
func1()
- def tearDown(self):
- self.tabix.close()
-
class TestIterationWithComments(TestIterationWithoutComments):
self.tabix = pysam.TabixFile(self.filename)
self.compare = loadAndConvert(self.filename)
+ def tearDown(self):
+ self.tabix.close()
+
def testRead(self):
for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
self.assertEqual(a, b)
+# Tests records fetched with the asGTF parser.
+# self.tabix and self.compare are not set in this class; presumably they
+# are provided by TestParser.setUp - confirm.
+class TestGTF(TestParser):
+
+ def testRead(self):
+ # Compares every fetched record with the entry at the same position
+ # in self.compare.
+
+ for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())):
+ c = self.compare[x]
+ self.assertEqual(len(c), len(r))
+ self.assertEqual(list(c), list(r))
+ self.assertEqual(c, str(r).split("\t"))
+ self.assertTrue(r.gene_id.startswith("ENSG"))
+ # transcript_id is only checked for non-gene features
+ if r.feature != 'gene':
+ self.assertTrue(r.transcript_id.startswith("ENST"))
+ self.assertEqual(c[0], r.contig)
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
+
+ def testSetting(self):
+ # Assigns to each attribute of every record; nothing is asserted,
+ # so the test only requires that the assignments do not raise.
+
+ for r in self.tabix.fetch(parser=pysam.asGTF()):
+ r.contig = r.contig + "_test"
+ r.source = r.source + "_test"
+ r.feature = r.feature + "_test"
+ r.start += 10
+ r.end += 10
+ r.score = 20
+ r.strand = "+"
+ r.frame = 0
+ r.attributes = 'gene_id "0001";'
+
+
class TestIterators(unittest.TestCase):
filename = os.path.join(DATADIR, "example.gtf.gz")
open(self.tmpfilename_uncompressed, "wb") as outfile:
outfile.write(infile.read())
+ def tearDown(self):
+ self.tabix.close()
+ os.unlink(self.tmpfilename_uncompressed)
+
def open(self):
if self.is_compressed:
# Not implemented
# self.assertRaises(ValueError, i.next)
- def tearUp(self):
- os.unlink(self.tmpfilename_uncompressed)
-
class TestIteratorsGenericCompressed(TestIterators):
is_compressed = True
is_compressed = False
-class TestGTF(TestParser):
-
- def testRead(self):
-
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())):
- c = self.compare[x]
- self.assertEqual(len(c), len(r))
- self.assertEqual(list(c), list(r))
- self.assertEqual(c, str(r).split("\t"))
- self.assertTrue(r.gene_id.startswith("ENSG"))
- if r.feature != 'gene':
- self.assertTrue(r.transcript_id.startswith("ENST"))
- self.assertEqual(c[0], r.contig)
- self.assertEqual("\t".join(map(str, c)),
- str(r))
-
-
class TestIterationMalformattedGTFFiles(unittest.TestCase):
'''test reading from malformatted gtf files.'''
self.tabix = pysam.TabixFile(self.filename)
self.compare = loadAndConvert(self.filename)
+ def tearDown(self):
+ self.tabix.close()
+
def testRead(self):
for x, r in enumerate(self.tabix.fetch(parser=pysam.asBed())):
self.assertEqual(int(c[2]) + 1, r.end)
self.assertEqual(str(int(c[2]) + 1), r[2])
- def tearDown(self):
- self.tabix.close()
-
class TestVCF(unittest.TestCase):
self.tabix = pysam.TabixFile(self.tmpfilename + ".gz")
self.compare = loadAndConvert(self.filename)
+ def tearDown(self):
+ self.tabix.close()
+
def testRead(self):
ncolumns = len(self.columns)
c[ncolumns + y] = "test_%i" % y
r[y] = "test_%i" % y
self.assertEqual(c[ncolumns + y], r[y])
-
- def tearDown(self):
- self.tabix.close()
class TestVCFFromVCF(TestVCF):
self.vcf = pysam.VCF()
self.compare = loadAndConvert(self.filename, encode=False)
+ def tearDown(self):
+ self.vcf.close()
+
def testConnecting(self):
fn = os.path.basename(self.filename)
def get_iterator(self):
- f = open(self.filename)
- fn = os.path.basename(self.filename)
+ with open(self.filename) as f:
+ fn = os.path.basename(self.filename)
- for x, msg in self.fail_on_opening:
- if "%i.vcf" % x == fn:
- self.assertRaises(ValueError, self.vcf.parse, f)
- return
+ for x, msg in self.fail_on_opening:
+ if "%i.vcf" % x == fn:
+ self.assertRaises(ValueError, self.vcf.parse, f)
+ return
- return self.vcf.parse(f)
+ for vcf_code, msg in self.fail_on_parsing:
+ if "%i.vcf" % vcf_code == fn:
+ self.assertRaises((ValueError,
+ AssertionError),
+ list, self.vcf.parse(f))
+ return
+ # python 2.7
+ # self.assertRaisesRegexp(
+ # ValueError, re.compile(msg), self.vcf.parse, f)
+
+ return list(self.vcf.parse(f))
def get_field_value(self, record, field):
return record[field]
missing_value = None
missing_quality = None
+ vcf = None
+
def filter2value(self, r, v):
if r == "PASS":
return ["PASS"], list(v)
TestVCF.setUp(self)
self.compare = loadAndConvert(self.filename, encode=False)
+ def tearDown(self):
+ if self.vcf:
+ self.vcf.close()
+ self.vcf = None
+
def get_iterator(self):
- vcf = pysam.VariantFile(self.filename)
- return vcf.fetch()
+ self.vcf = pysam.VariantFile(self.filename)
+ return self.vcf.fetch()
def get_field_value(self, record, field):
return getattr(record, field)
local = os.path.join(DATADIR, "example.gtf.gz")
def setUp(self):
+ if not checkURL(self.url):
+ self.remote_file = None
+ return
+
self.remote_file = pysam.TabixFile(self.url, "r")
self.local_file = pysam.TabixFile(self.local, "r")
+ def tearDown(self):
+ if self.remote_file is None:
+ return
+
+ self.remote_file.close()
+ self.local_file.close()
+
def testFetchAll(self):
- if not checkURL(self.url):
+ if self.remote_file is None:
return
remote_result = list(self.remote_file.fetch())
self.assertEqual(x, y)
def testHeader(self):
+ if self.remote_file is None:
+ return
+
self.assertEqual(list(self.local_file.header), [])
self.assertRaises(AttributeError,
getattr,
self.remote_file,
"header")
- def tearDown(self):
- self.remote_file.close()
- self.local_file.close()
-
class TestIndexArgument(unittest.TestCase):
shutil.copyfile(self.index_src, self.index_dst)
with pysam.TabixFile(
- self.filename_src, "r", index=self.index_src) as \
- same_basename_file:
+ self.filename_src, "r", index=self.index_src) as same_basename_file:
same_basename_results = list(same_basename_file.fetch())
with pysam.TabixFile(
- self.filename_dst, "r", index=self.index_dst) as \
- diff_index_file:
+ self.filename_dst, "r", index=self.index_dst) as diff_index_file:
diff_index_result = list(diff_index_file.fetch())
self.assertEqual(len(same_basename_results), len(diff_index_result))
def testDoubleFetch(self):
- with pysam.TabixFile(self.filename) as f:
+ with pysam.TabixFile(self.filename) as f:
for a, b in zip(f.fetch(multiple_iterators=True),
f.fetch(multiple_iterators=True)):
--- /dev/null
+import pysam
+
+def test_idxstats_parse_split_lines():
+ # Each line returned by idxstats must split on whitespace into exactly
+ # four fields; the tuple unpacking raises ValueError otherwise.
+ bam_filename = "./pysam_data/ex2.bam"
+ lines = pysam.idxstats(bam_filename, split_lines=True) # Test pysam 0.8.X style output, which returns a list of lines
+ for line in lines:
+ _seqname, _seqlen, nmapped, _nunmapped = line.split()
+
+
+def test_bedcov_split_lines():
+ # Each line returned by bedcov must split on tabs into 4 or 5 fields.
+ bam_filename = "./pysam_data/ex1.bam"
+ bed_filename = "./pysam_data/ex1.bed"
+ lines = pysam.bedcov(bed_filename, bam_filename, split_lines=True) # Test pysam 0.8.X style output, which returns a list of lines
+ for line in lines:
+ fields = line.split('\t')
+ assert len(fields) in [4, 5], "bedcov should give tab delimited output with 4 or 5 fields. Split line (%s) gives %d fields." % (fields, len(fields))
+
+
+def test_idxstats_parse():
+ # The single string returned by idxstats is split into lines here; each
+ # line must split on tabs into exactly four fields, otherwise the tuple
+ # unpacking raises ValueError.
+ bam_filename = "./pysam_data/ex2.bam"
+ idxstats_string = pysam.idxstats(bam_filename, split_lines=False) # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
+ lines = idxstats_string.splitlines()
+ for line in lines:
+ splt = line.split("\t")
+ _seqname, _seqlen, nmapped, _nunmapped = splt
+
+
+def test_bedcov():
+ # The single string returned by bedcov is split into lines here; each
+ # line must split on tabs into 4 or 5 fields.
+ bam_filename = "./pysam_data/ex1.bam"
+ bed_filename = "./pysam_data/ex1.bed"
+ bedcov_string = pysam.bedcov(bed_filename, bam_filename, split_lines=False) # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
+ lines = bedcov_string.splitlines()
+ for line in lines:
+ fields = line.split('\t')
+ assert len(fields) in [4, 5], "bedcov should give tab delimited output with 4 or 5 fields. Split line (%s) gives %d fields." % (fields, len(fields))