From: Afif Elghraoui Date: Sun, 19 Jun 2016 21:17:37 +0000 (-0700) Subject: Imported Upstream version 0.9.1+ds X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2~12^2~19 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=f4708100096e3853ab9749dbb378225683d04be6;p=python-pysam.git Imported Upstream version 0.9.1+ds --- diff --git a/INSTALL b/INSTALL index 30fe770..5ddff7f 100644 --- a/INSTALL +++ b/INSTALL @@ -15,7 +15,7 @@ manually modifying one line in Makefile. curl -Pysam requires Python (2.6 or greater) and Cython (0.22 or greater). +Pysam requires Python (2.7 or greater) and Cython (0.22 or greater). It has not been tested on many other platforms. Compilation diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index 6f22272..d4e856d 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -26,6 +26,7 @@ THE SOFTWARE. */ #define BCFTOOLS_H #include +#include #include #include @@ -37,7 +38,7 @@ THE SOFTWARE. */ #define FT_STDIN (1<<3) char *bcftools_version(void); -void error(const char *format, ...); +void error(const char *format, ...) 
HTS_NORETURN; void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 7a615fe..051f353 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -623,7 +623,7 @@ int main_consensus(int argc, char *argv[]) {"chain",1,0,'c'}, {0,0,0,0} }; - char c; + int c; while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0) { switch (c) diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 7765d6b..91aa5ae 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -87,7 +87,7 @@ args_t; static chain_t* init_chain(chain_t *chain, int ref_ori_pos) { -// fprintf(pysamerr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos); +// fprintf(pysam_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos); chain = (chain_t*) calloc(1,sizeof(chain_t)); chain->num = 0; chain->block_lengths = NULL; @@ -157,7 +157,7 @@ static void print_chain(args_t *args) static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len) { -// fprintf(pysamerr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); +// fprintf(pysam_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); int num = chain->num; if (ref_start <= chain->ref_last_block_ori) { @@ -218,7 +218,7 @@ static void init_data(args_t *args) args->fp_out = fopen(args->output_fname,"w"); if ( ! 
args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno)); } - else args->fp_out = stdout; + else args->fp_out = pysam_stdout; } static void destroy_data(args_t *args) @@ -257,7 +257,7 @@ static void init_region(args_t *args, char *line) } } args->rid = bcf_hdr_name2id(args->hdr,line); - if ( args->rid<0 ) fprintf(pysamerr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname); + if ( args->rid<0 ) fprintf(pysam_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname); args->fa_buf.l = 0; args->fa_length = 0; args->fa_end_pos = to; @@ -342,7 +342,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( rec->pos <= args->fa_frz_pos ) { - fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); + fprintf(pysam_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); return; } if ( args->mask ) @@ -428,7 +428,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) { - // fprintf(pysamerr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off); + // fprintf(pysam_stderr,"%d .. 
[%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off); char tmp = 0; if ( args->fa_buf.l - idx > rec->rlen ) { @@ -589,23 +589,23 @@ static void consensus(args_t *args) static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Create consensus sequence by applying VCF variants to a reference\n"); - fprintf(pysamerr, " fasta file.\n"); - fprintf(pysamerr, "Usage: bcftools consensus [OPTIONS] \n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(pysamerr, " -H, --haplotype <1|2> apply variants for the given haplotype\n"); - fprintf(pysamerr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(pysamerr, " -m, --mask replace regions with N\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -c, --chain write a chain file for liftover\n"); - fprintf(pysamerr, " -s, --sample apply variants of the given sample\n"); - fprintf(pysamerr, "Examples:\n"); - fprintf(pysamerr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); - fprintf(pysamerr, " # in the form \">chr:from-to\".\n"); - fprintf(pysamerr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference\n"); + fprintf(pysam_stderr, " fasta file.\n"); + fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] \n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(pysam_stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n"); + fprintf(pysam_stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(pysam_stderr, " -m, --mask replace regions with N\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -c, --chain write a chain file for liftover\n"); + fprintf(pysam_stderr, " -s, --sample apply variants of the given sample\n"); + fprintf(pysam_stderr, "Examples:\n"); + fprintf(pysam_stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); + fprintf(pysam_stderr, " # in the form \">chr:from-to\".\n"); + fprintf(pysam_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -625,7 +625,7 @@ int main_consensus(int argc, char *argv[]) {"chain",1,0,'c'}, {0,0,0,0} }; - char c; + int c; while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0) { switch (c) diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index ee27882..084ef50 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -197,7 +197,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break; case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break; case BCF_BT_CHAR: kputc(info->v1.i, str); break; - default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break; + default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; } } else if ( fmt->subscript >=0 ) @@ -218,7 +218,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break; case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break; - default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break; + default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH } @@ -730,7 +730,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { 
fmt->type = T_INFO; - fprintf(pysamerr,"Warning: Assuming INFO/%s\n", key); + fprintf(pysam_stderr,"Warning: Assuming INFO/%s\n", key); } } } @@ -896,7 +896,7 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * char *p = convert->format_str; while ( *p ) { - //fprintf(pysamerr,"<%s>\n", p); + //fprintf(pysam_stderr,"<%s>\n", p); switch (*p) { case '[': is_gtf = 1; p++; break; diff --git a/bcftools/em.c.pysam.c b/bcftools/em.c.pysam.c index 758d919..8109152 100644 --- a/bcftools/em.c.pysam.c +++ b/bcftools/em.c.pysam.c @@ -74,7 +74,7 @@ static double prob1(double f, void *data) minaux1_t *a = (minaux1_t*)data; double p = 1., l = 0., f3[3]; int i; -// printf("brent %lg\n", f); +// fprintf(pysam_stdout, "brent %lg\n", f); if (f < 0 || f > 1) return 1e300; f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; for (i = a->beg; i < a->end; ++i) { @@ -90,7 +90,7 @@ static double freq_iter(double *f, const double *_pdg, int beg, int end) { double f0 = *f, f3[3], err; int i; -// printf("em %lg\n", *f); +// fprintf(pysam_stdout, "em %lg\n", *f); f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; for (i = beg, f0 = 0.; i < end; ++i) { const double *pdg = _pdg + i * 3; @@ -128,7 +128,7 @@ static double g3_iter(double g[3], const double *_pdg, int beg, int end) double err, gg[3]; int i; gg[0] = gg[1] = gg[2] = 0.; -// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); +// fprintf(pysam_stdout, "%lg,%lg,%lg\n", g[0], g[1], g[2]); for (i = beg; i < end; ++i) { double sum, tmp[3]; const double *pdg = _pdg + i * 3; @@ -237,7 +237,7 @@ static int pair_freq_iter(int n, double *pdg[2], double f[4]) { double ff[4]; int i, k, h; -// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); +// fprintf(pysam_stdout, "%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); memset(ff, 0, 4 * sizeof(double)); for (i = 0; i < n; ++i) { double *p[2], sum, tmp; diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 7520106..531339e 100644 --- 
a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -360,7 +360,7 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break; case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break; - default: fprintf(pysamerr,"todo: type %d\n", info->type); exit(1); break; + default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH return -1; // this shouldn't happen @@ -586,7 +586,7 @@ gt_length_too_big: case BCF_BT_INT8: BRANCH(int8_t); break; case BCF_BT_INT16: BRANCH(int16_t); break; case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(pysamerr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break; + default: fprintf(pysam_stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break; } #undef BRANCH @@ -1045,7 +1045,7 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) { \ if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \ } \ - /*fprintf(pysamerr,"pass=%d\n", pass_site);*/ \ + /*fprintf(pysam_stderr,"pass=%d\n", pass_site);*/ \ (ret) = pass_site; \ } \ } @@ -1394,16 +1394,16 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks) if ( tok->tok_type==TOK_VAL ) { if ( tok->key ) - fprintf(pysamerr,"%s", tok->key); + fprintf(pysam_stderr,"%s", tok->key); else if ( tok->tag ) - fprintf(pysamerr,"%s", tok->tag); + fprintf(pysam_stderr,"%s", tok->tag); else - fprintf(pysamerr,"%e", tok->threshold); + fprintf(pysam_stderr,"%e", tok->threshold); } else - fprintf(pysamerr,"%c", TOKEN_STRING[tok->tok_type]); - if ( tok->setter ) fprintf(pysamerr,"\t[setter %p]", tok->setter); - fprintf(pysamerr,"\n"); + fprintf(pysam_stderr,"%c", TOKEN_STRING[tok->tok_type]); + if ( 
tok->setter ) fprintf(pysam_stderr,"\t[setter %p]", tok->setter); + fprintf(pysam_stderr,"\n"); } } @@ -1427,8 +1427,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) ret = filters_next_token(&tmp, &len); if ( ret==-1 ) error("Missing quotes in: %s\n", str); - //fprintf(pysamerr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len); - //int i; for (i=0; i @@ -60,6 +60,23 @@ static inline void khash_str2str_destroy_free(void *_hash) kh_destroy(str2str, hash); } +/* + * Destroys the hash structure, the keys and the values + */ +static inline void khash_str2str_destroy_free_all(void *_hash) +{ + khash_t(str2str) *hash = (khash_t(str2str)*)_hash; + khint_t k; + if (hash == 0) return; + for (k = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k)) + { + free((char*)kh_key(hash, k)); + free((char*)kh_val(hash, k)); + } + kh_destroy(str2str, hash); +} + /* * Returns value if key exists or NULL if not */ diff --git a/bcftools/main.c b/bcftools/main.c index f08b5c7..1892c1d 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -1,6 +1,6 @@ /* main.c -- main bcftools command front-end. - Copyright (C) 2012-2015 Genome Research Ltd. + Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -219,7 +219,7 @@ int main(int argc, char *argv[]) if (argc < 2) { usage(stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2015 Genome Research Ltd.\n", bcftools_version(), hts_version()); + printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2016 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL printf("License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index f180e56..f578442 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -2,7 +2,7 @@ /* main.c -- main bcftools command front-end. - Copyright (C) 2012-2015 Genome Research Ltd. 
+ Copyright (C) 2012-2016 Genome Research Ltd. Author: Petr Danecek @@ -218,24 +218,24 @@ static void usage(FILE *fp) int bcftools_main(int argc, char *argv[]) { - if (argc < 2) { usage(pysamerr); return 1; } + if (argc < 2) { usage(pysam_stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2015 Genome Research Ltd.\n", bcftools_version(), hts_version()); + fprintf(pysam_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2016 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL - printf("License GPLv3+: GNU GPL version 3 or later \n"); + fprintf(pysam_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); #else - printf("License Expat: The MIT/Expat license\n"); + fprintf(pysam_stdout, "License Expat: The MIT/Expat license\n"); #endif - printf("This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n"); + fprintf(pysam_stdout, "This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n"); return 0; } else if (strcmp(argv[1], "--version-only") == 0) { - printf("%s+htslib-%s\n", bcftools_version(), hts_version()); + fprintf(pysam_stdout, "%s+htslib-%s\n", bcftools_version(), hts_version()); return 0; } else if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) { - if (argc == 2) { usage(stdout); return 0; } + if (argc == 2) { usage(pysam_stdout); return 0; } // Otherwise change "bcftools help COMMAND [...]" to "bcftools COMMAND"; // main_xyz() functions by convention display the subcommand's usage // when invoked without any arguments. 
@@ -260,7 +260,7 @@ int bcftools_main(int argc, char *argv[]) } i++; } - fprintf(pysamerr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]); + fprintf(pysam_stderr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]); return 1; } diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c index b4c4a99..29ed799 100644 --- a/bcftools/mcall.c.pysam.c +++ b/bcftools/mcall.c.pysam.c @@ -288,7 +288,7 @@ void mcall_init(call_t *call) call->theta *= aM; if ( call->theta >= 1 ) { - fprintf(pysamerr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta); + fprintf(pysam_stderr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta); call->theta = 0.99; } call->theta = log(call->theta); @@ -516,13 +516,13 @@ float calc_ICB(int nref, int nalt, int nhets, int ndiploid) double q = 2*fref*falt; // probability of a het, assuming HWE double mean = q*ndiploid; - //fprintf(pysamerr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid); + //fprintf(pysam_stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid); // Can we use normal approximation? The second condition is for performance only // and is not well justified. if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 ) { - //fprintf(pysamerr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); + //fprintf(pysam_stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))); } @@ -1032,12 +1032,12 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n if ( igt==GT_SKIP ) continue; lk += gl[igt]; npresent++; - // fprintf(pysamerr," %e", gl[igt]); + // fprintf(pysam_stderr," %e", gl[igt]); } - // fprintf(pysamerr,"\t\t"); + // fprintf(pysam_stderr,"\t\t"); double Pkij = npresent==3 ? 
(double)2/(trio[itr]>>12) : 1; // with missing genotypes Pkij's are different lk += log(1 - trio_Pm * (1 - Pkij)); - // fprintf(pysamerr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij); + // fprintf(pysam_stderr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij); if ( c_lk < lk ) { c_lk = lk; c_itr = trio[itr]; } if ( uc_itr==trio[itr] ) uc_is_mendelian = 1; } @@ -1045,10 +1045,10 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n if ( !uc_is_mendelian ) { uc_lk += log(1 - trio_Pm); - // fprintf(pysamerr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); + // fprintf(pysam_stderr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); if ( c_lk < uc_lk ) { c_lk = uc_lk; c_itr = uc_itr; } } - // fprintf(pysamerr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); + // fprintf(pysam_stderr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); // Set genotypes for father, mother, child and calculate genotype qualities for (i=0; i<3; i++) @@ -1429,7 +1429,7 @@ int mcall(call_t *call, bcf1_t *rec) int out_als, nout; if ( nals > 8*sizeof(out_als) ) { - fprintf(pysamerr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); + fprintf(pysam_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); return 0; } nout = mcall_find_best_alleles(call, nals, &out_als); @@ -1473,7 +1473,7 @@ int mcall(call_t *call, bcf1_t *rec) { if ( nout>4 ) { - fprintf(pysamerr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); + fprintf(pysam_stderr,"Too many alleles at %s:%d, skipping.\n", 
bcf_seqname(call->hdr,rec),rec->pos+1); return 0; } mcall_call_trio_genotypes(call, rec, nals,nout,out_als); diff --git a/bcftools/ploidy.c b/bcftools/ploidy.c index 160bc3e..719e175 100644 --- a/bcftools/ploidy.c +++ b/bcftools/ploidy.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2014 Genome Research Ltd. +/* + Copyright (C) 2014-2016 Genome Research Ltd. Author: Petr Danecek @@ -98,7 +98,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s); sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]); ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex); - ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt; + ploidy->sex2dflt[ploidy->nsex-1] = -1; } ss = se; @@ -106,8 +106,8 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v if ( !*se ) error("Could not parse: %s\n", line); sp->ploidy = strtol(ss,&se,10); if ( ss==se ) error("Could not parse: %s\n", line); - if ( sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy; - if ( sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy; + if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy; + if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy; // Special case, chr="*" stands for a default value if ( default_ploidy_def ) @@ -119,19 +119,32 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v return 0; } +static void _set_defaults(ploidy_t *ploidy, int dflt) +{ + int i; + if ( khash_str2int_get(ploidy->sex2id, "*", &i) == 0 ) dflt = ploidy->sex2dflt[i]; + for (i=0; insex; i++) + if ( ploidy->sex2dflt[i]==-1 ) ploidy->sex2dflt[i] = dflt; + + ploidy->dflt = dflt; + if ( ploidy->min<0 || dflt < ploidy->min ) ploidy->min = dflt; + if ( ploidy->max<0 || dflt > ploidy->max ) ploidy->max = dflt; +} + ploidy_t *ploidy_init(const char *fname, int dflt) { ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t)); if 
( !pld ) return NULL; - pld->dflt = pld->min = pld->max = dflt; + pld->min = pld->max = -1; pld->sex2id = khash_str2int_init(); pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld); if ( !pld->idx ) { ploidy_destroy(pld); - pld = NULL; + return NULL; } + _set_defaults(pld,dflt); return pld; } @@ -140,7 +153,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t)); if ( !pld ) return NULL; - pld->dflt = pld->min = pld->max = dflt; + pld->min = pld->max = -1; pld->sex2id = khash_str2int_init(); pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld); @@ -160,6 +173,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) regidx_insert(pld->idx,NULL); free(tmp.s); + _set_defaults(pld,dflt); return pld; } diff --git a/bcftools/ploidy.c.pysam.c b/bcftools/ploidy.c.pysam.c index 4f567a3..d0468b9 100644 --- a/bcftools/ploidy.c.pysam.c +++ b/bcftools/ploidy.c.pysam.c @@ -1,7 +1,7 @@ #include "pysam.h" -/* - Copyright (C) 2014 Genome Research Ltd. +/* + Copyright (C) 2014-2016 Genome Research Ltd. 
Author: Petr Danecek @@ -100,7 +100,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s); sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]); ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex); - ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt; + ploidy->sex2dflt[ploidy->nsex-1] = -1; } ss = se; @@ -108,8 +108,8 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v if ( !*se ) error("Could not parse: %s\n", line); sp->ploidy = strtol(ss,&se,10); if ( ss==se ) error("Could not parse: %s\n", line); - if ( sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy; - if ( sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy; + if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy; + if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy; // Special case, chr="*" stands for a default value if ( default_ploidy_def ) @@ -121,19 +121,32 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v return 0; } +static void _set_defaults(ploidy_t *ploidy, int dflt) +{ + int i; + if ( khash_str2int_get(ploidy->sex2id, "*", &i) == 0 ) dflt = ploidy->sex2dflt[i]; + for (i=0; insex; i++) + if ( ploidy->sex2dflt[i]==-1 ) ploidy->sex2dflt[i] = dflt; + + ploidy->dflt = dflt; + if ( ploidy->min<0 || dflt < ploidy->min ) ploidy->min = dflt; + if ( ploidy->max<0 || dflt > ploidy->max ) ploidy->max = dflt; +} + ploidy_t *ploidy_init(const char *fname, int dflt) { ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t)); if ( !pld ) return NULL; - pld->dflt = pld->min = pld->max = dflt; + pld->min = pld->max = -1; pld->sex2id = khash_str2int_init(); pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld); if ( !pld->idx ) { ploidy_destroy(pld); - pld = NULL; + return NULL; } + _set_defaults(pld,dflt); return pld; } @@ -142,7 +155,7 @@ ploidy_t 
*ploidy_init_string(const char *str, int dflt) ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t)); if ( !pld ) return NULL; - pld->dflt = pld->min = pld->max = dflt; + pld->min = pld->max = -1; pld->sex2id = khash_str2int_init(); pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld); @@ -162,6 +175,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt) regidx_insert(pld->idx,NULL); free(tmp.s); + _set_defaults(pld,dflt); return pld; } diff --git a/bcftools/prob1.c.pysam.c b/bcftools/prob1.c.pysam.c index bad2478..a59ec44 100644 --- a/bcftools/prob1.c.pysam.c +++ b/bcftools/prob1.c.pysam.c @@ -128,7 +128,7 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) { if (n1 == 0 || n1 >= b->n) return -1; if (b->M != b->n * 2) { - fprintf(pysamerr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); + fprintf(pysam_stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); return -1; } b->n1 = n1; @@ -523,9 +523,9 @@ int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1 void bcf_p1_dump_afs(bcf_p1aux_t *ma) { int k; - fprintf(pysamerr, "[afs]"); + fprintf(pysam_stderr, "[afs]"); for (k = 0; k <= ma->M; ++k) - fprintf(pysamerr, " %d:%.3lf", k, ma->afs[ma->M - k]); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); + fprintf(pysam_stderr, "\n"); memset(ma->afs, 0, sizeof(double) * (ma->M + 1)); } diff --git a/bcftools/pysam.h b/bcftools/pysam.h index 008cbbd..b0fc4fb 100644 --- a/bcftools/pysam.h +++ b/bcftools/pysam.h @@ -1,5 +1,7 @@ #ifndef PYSAM_H #define PYSAM_H #include "stdio.h" -extern FILE * pysamerr; +extern FILE * pysam_stderr; +extern FILE * pysam_stdout; +extern const char * pysam_stdout_fn; #endif diff --git a/bcftools/tabix.c.pysam.c b/bcftools/tabix.c.pysam.c index 0eb328f..afa3619 100644 --- a/bcftools/tabix.c.pysam.c +++ b/bcftools/tabix.c.pysam.c @@ -52,24 +52,24 @@ int main_tabix(int argc, char *argv[]) else if (strcmp(optarg, "sam") 
== 0) conf_ptr = &tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; else { - fprintf(pysamerr, "The type '%s' not recognised\n", optarg); + fprintf(pysam_stderr, "The type '%s' not recognised\n", optarg); return 1; } } if (optind == argc) { - fprintf(pysamerr, "\nUsage: bcftools tabix [options] [reg1 [...]]\n\n"); - fprintf(pysamerr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); - fprintf(pysamerr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); - fprintf(pysamerr, " -b INT column number for region start [4]\n"); - fprintf(pysamerr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); - fprintf(pysamerr, " -0 specify coordinates are zero-based\n"); - fprintf(pysamerr, " -S INT skip first INT lines [0]\n"); - fprintf(pysamerr, " -c CHAR skip lines starting with CHAR [null]\n"); - fprintf(pysamerr, " -a print all records\n"); - fprintf(pysamerr, " -f force to overwrite existing index\n"); - fprintf(pysamerr, " -m INT set the minimal interval size to 1< [reg1 [...]]\n\n"); + fprintf(pysam_stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); + fprintf(pysam_stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); + fprintf(pysam_stderr, " -b INT column number for region start [4]\n"); + fprintf(pysam_stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); + fprintf(pysam_stderr, " -0 specify coordinates are zero-based\n"); + fprintf(pysam_stderr, " -S INT skip first INT lines [0]\n"); + fprintf(pysam_stderr, " -c CHAR skip lines starting with CHAR [null]\n"); + fprintf(pysam_stderr, " -a print all records\n"); + fprintf(pysam_stderr, " -f force to overwrite existing index\n"); + fprintf(pysam_stderr, " -m INT set the minimal interval size to 1<= 0) puts(s.s); + while (bgzf_getline(fp, '\n', &s) >= 0) fputs(s.s, pysam_stdout) & fputc('\n', pysam_stdout); bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // create index @@ 
-100,13 +100,13 @@ int main_tabix(int argc, char *argv[]) strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); - fprintf(pysamerr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); + fprintf(pysam_stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { - fprintf(pysamerr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); + fprintf(pysam_stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); return 1; } } else { // read with random access @@ -120,7 +120,7 @@ int main_tabix(int argc, char *argv[]) for (i = optind + 1; i < argc; ++i) { hts_itr_t *itr; if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue; - while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s); + while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) fputs(s.s, pysam_stdout) & fputc('\n', pysam_stdout); tbx_itr_destroy(itr); } free(s.s); diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index 96a1649..d5164f3 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. 
Author: Petr Danecek @@ -120,7 +120,7 @@ typedef struct _args_t char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; - int argc, drop_header, tgts_is_vcf, mark_sites_logic; + int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic; } args_t; @@ -809,6 +809,135 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out)); } } +static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) +{ + int i, nmax = 0; + for (i=icol_beg; icols[i], *end = str; + if ( str[0]=='.' && !str[1] ) + { + // missing value + if ( !nmax ) nmax = 1; + continue; + } + int n = 1; + while ( *end ) + { + if ( *end==',' ) n++; + end++; + } + if ( nmaxhdr_out); + assert( col->icol+nsmpl <= tab->ncols ); + int nvals = count_vals(tab,col->icol,col->icol+nsmpl); + assert( nvals>0 ); + hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi); + + int icol = col->icol, ismpl; + for (ismpl=0; ismpltmpi + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + ptr[ival++] = bcf_int32_missing; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtol(str, &end, 10); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? 
end+1 : end; + } + while ( ivalhdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals); +} +static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + int nsmpl = bcf_hdr_nsamples(args->hdr_out); + assert( col->icol+nsmpl <= tab->ncols ); + int nvals = count_vals(tab,col->icol,col->icol+nsmpl); + assert( nvals>0 ); + hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf); + + int icol = col->icol, ismpl; + for (ismpl=0; ismpltmpf + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + bcf_float_set_missing(ptr[ival]); + ival++; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtod(str, &end); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? end+1 : end; + } + while ( ivalhdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals); +} +static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + int nsmpl = bcf_hdr_nsamples(args->hdr_out); + assert( col->icol+nsmpl <= tab->ncols ); + + int i, max_len = 0; + for (i=col->icol; iicol+nsmpl; i++) + { + int len = strlen(tab->cols[i]); + if ( max_len < len ) max_len = len; + } + hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps); + + int icol = col->icol, ismpl; + for (ismpl=0; ismpltmps + ismpl*max_len; + char *str = tab->cols[icol]; + i = 0; + while ( str[i] ) + { + ptr[i] = str[i]; + i++; + } + while ( ihdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len); +} static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; @@ -1127,7 +1256,7 @@ static void init_columns(args_t *args) kstring_t str = {0,0,0}, tmp = {0,0,0}; char *ss = args->columns, *se = ss; args->ncols = 0; - int i 
= -1, has_fmt_str = 0, force_samples = -1; + int icol = -1, has_fmt_str = 0, force_samples = -1; while ( *ss ) { if ( *se && *se!=',' ) { se++; continue; } @@ -1135,22 +1264,22 @@ static void init_columns(args_t *args) if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; } else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; } else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; } - i++; + icol++; str.l = 0; kputsn(ss, se-ss, &str); if ( !str.s[0] || !strcasecmp("-",str.s) ) ; - else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = i; - else if ( !strcasecmp("POS",str.s) ) args->from_idx = i; - else if ( !strcasecmp("FROM",str.s) ) args->from_idx = i; - else if ( !strcasecmp("TO",str.s) ) args->to_idx = i; - else if ( !strcasecmp("REF",str.s) ) args->ref_idx = i; - else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = i; + else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol; + else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol; + else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol; + else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol; + else if ( !strcasecmp("REF",str.s) ) args->ref_idx = icol; + else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol; else if ( !strcasecmp("ID",str.s) ) { if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? 
vcf_setter_id : setter_id; col->hdr_key = strdup(str.s); @@ -1160,7 +1289,7 @@ static void init_columns(args_t *args) if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter; col->hdr_key = strdup(str.s); @@ -1187,7 +1316,7 @@ static void init_columns(args_t *args) if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual; col->hdr_key = strdup(str.s); @@ -1262,30 +1391,38 @@ static void init_columns(args_t *args) } else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) ) { - if ( !args->tgts_is_vcf ) - error("Error: FORMAT fields can be carried over from a VCF file only.\n"); - char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 
4 : 7); if ( force_samples<0 ) force_samples = replace; - if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;; - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL); - tmp.l = 0; - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); - bcf_hdr_sync(args->hdr_out); + if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace; + if ( args->tgts_is_vcf ) + { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL); + tmp.l = 0; + bcf_hrec_format(hrec, &tmp); + bcf_hdr_append(args->hdr_out, tmp.s); + bcf_hdr_sync(args->hdr_out); + } int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key); + if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) + error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = -1; + if ( !args->tgts_is_vcf ) + { + col->icol = icol; + icol += bcf_hdr_nsamples(args->hdr_out) - 1; + } + else + col->icol = -1; col->replace = replace; col->hdr_key = strdup(key); if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt; else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { - case BCF_HT_INT: col->setter = vcf_setter_format_int; break; - case BCF_HT_REAL: col->setter = vcf_setter_format_real; break; - case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break; + case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_format_int : setter_format_int; break; + case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_format_real : setter_format_real; break; + case BCF_HT_STR: col->setter = args->tgts_is_vcf ? 
vcf_setter_format_str : setter_format_str; has_fmt_str = 1; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id)); } } @@ -1314,7 +1451,7 @@ static void init_columns(args_t *args) args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->hdr_key = strdup(str.s); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); @@ -1338,11 +1475,12 @@ static void init_columns(args_t *args) if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt); if ( has_fmt_str ) { - int n = bcf_hdr_nsamples(args->hdr_out) > bcf_hdr_nsamples(args->files->readers[1].header) ? bcf_hdr_nsamples(args->hdr_out) : bcf_hdr_nsamples(args->files->readers[1].header); + int n = bcf_hdr_nsamples(args->hdr_out); + if ( args->tgts_is_vcf && nfiles->readers[1].header) ) n = bcf_hdr_nsamples(args->files->readers[1].header); args->tmpp = (char**)malloc(sizeof(char*)*n); args->tmpp2 = (char**)malloc(sizeof(char*)*n); } - if ( force_samples>=0 ) + if ( force_samples>=0 && args->tgts_is_vcf ) set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 
0 : 1); } @@ -1419,7 +1557,7 @@ static void init_data(args_t *args) args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); } - bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); @@ -1517,8 +1655,10 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } if ( args->ref_idx != -1 ) { - assert( args->ref_idx < tmp->ncols ); - assert( args->alt_idx < tmp->ncols ); + if ( args->ref_idx >= tmp->ncols ) + error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s); + if ( args->alt_idx >= tmp->ncols ) + error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s); tmp->nals = 2; hts_expand(char*,tmp->nals,tmp->mals,tmp->als); tmp->als[0] = tmp->cols[args->ref_idx]; @@ -1624,9 +1764,10 @@ static void usage(args_t *args) fprintf(stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(stderr, " -I, --set-id [+] set ID column, see man pagee for details\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man pagee for details)\n"); + fprintf(stderr, " -I, --set-id [+] set ID column, see man page for details\n"); + fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); @@ -1649,6 +1790,7 @@ int main_vcfannotate(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; args->set_ids_replace = 1; int regions_is_file = 0; @@ -1671,6 +1813,7 @@ int main_vcfannotate(int argc, char *argv[]) {"header-lines",required_argument,NULL,'h'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0) @@ -1705,6 +1848,7 @@ int main_vcfannotate(int argc, char *argv[]) case 'h': args->header_fname = optarg; break; case 1 : args->rename_chrs = optarg; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : 
args->record_cmd_line = 0; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index 1d86dbe..ea8398c 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -2,7 +2,7 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Petr Danecek @@ -122,7 +122,7 @@ typedef struct _args_t char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; - int argc, drop_header, tgts_is_vcf, mark_sites_logic; + int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic; } args_t; @@ -265,7 +265,7 @@ static void init_remove_annots(args_t *args) int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s); if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) ) { - fprintf(pysamerr,"Warning: The tag \"%s\" not defined in the header\n", str.s); + fprintf(pysam_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); args->nrm--; } else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) @@ -811,6 +811,135 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out)); } } +static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) +{ + int i, nmax = 0; + for (i=icol_beg; icols[i], *end = str; + if ( str[0]=='.' 
&& !str[1] ) + { + // missing value + if ( !nmax ) nmax = 1; + continue; + } + int n = 1; + while ( *end ) + { + if ( *end==',' ) n++; + end++; + } + if ( nmaxhdr_out); + assert( col->icol+nsmpl <= tab->ncols ); + int nvals = count_vals(tab,col->icol,col->icol+nsmpl); + assert( nvals>0 ); + hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi); + + int icol = col->icol, ismpl; + for (ismpl=0; ismpltmpi + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + ptr[ival++] = bcf_int32_missing; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtol(str, &end, 10); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? end+1 : end; + } + while ( ivalhdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals); +} +static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + int nsmpl = bcf_hdr_nsamples(args->hdr_out); + assert( col->icol+nsmpl <= tab->ncols ); + int nvals = count_vals(tab,col->icol,col->icol+nsmpl); + assert( nvals>0 ); + hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf); + + int icol = col->icol, ismpl; + for (ismpl=0; ismpltmpf + ismpl*nvals; + int ival = 0; + + char *str = tab->cols[icol]; + while ( *str ) + { + if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value + { + bcf_float_set_missing(ptr[ival]); + ival++; + str += str[1] ? 2 : 1; + continue; + } + + char *end = str; + ptr[ival] = strtod(str, &end); + if ( end==str ) + error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); + + ival++; + str = *end ? 
end+1 : end; + } + while ( ivalhdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals); +} +static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + int nsmpl = bcf_hdr_nsamples(args->hdr_out); + assert( col->icol+nsmpl <= tab->ncols ); + + int i, max_len = 0; + for (i=col->icol; iicol+nsmpl; i++) + { + int len = strlen(tab->cols[i]); + if ( max_len < len ) max_len = len; + } + hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps); + + int icol = col->icol, ismpl; + for (ismpl=0; ismpltmps + ismpl*max_len; + char *str = tab->cols[icol]; + i = 0; + while ( str[i] ) + { + ptr[i] = str[i]; + i++; + } + while ( ihdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len); +} static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; @@ -1010,7 +1139,7 @@ static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_s return; // the same samples in both files if ( !nmatch ) error("No matching samples found in the source and the destination file\n"); - if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(pysamerr,"%d sample(s) in common\n", nmatch); + if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(pysam_stderr,"%d sample(s) in common\n", nmatch); args->nsample_map = bcf_hdr_nsamples(dst); args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); @@ -1129,7 +1258,7 @@ static void init_columns(args_t *args) kstring_t str = {0,0,0}, tmp = {0,0,0}; char *ss = args->columns, *se = ss; args->ncols = 0; - int i = -1, has_fmt_str = 0, force_samples = -1; + int icol = -1, has_fmt_str = 0, force_samples = -1; while ( *ss ) { if ( *se && *se!=',' ) { se++; continue; } @@ -1137,22 +1266,22 @@ static void init_columns(args_t *args) if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; } else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; } else if ( *ss=='=' ) { replace = 
SET_OR_APPEND; ss++; } - i++; + icol++; str.l = 0; kputsn(ss, se-ss, &str); if ( !str.s[0] || !strcasecmp("-",str.s) ) ; - else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = i; - else if ( !strcasecmp("POS",str.s) ) args->from_idx = i; - else if ( !strcasecmp("FROM",str.s) ) args->from_idx = i; - else if ( !strcasecmp("TO",str.s) ) args->to_idx = i; - else if ( !strcasecmp("REF",str.s) ) args->ref_idx = i; - else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = i; + else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol; + else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol; + else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol; + else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol; + else if ( !strcasecmp("REF",str.s) ) args->ref_idx = icol; + else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol; else if ( !strcasecmp("ID",str.s) ) { if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id; col->hdr_key = strdup(str.s); @@ -1162,7 +1291,7 @@ static void init_columns(args_t *args) if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? 
vcf_setter_filter : setter_filter; col->hdr_key = strdup(str.s); @@ -1189,7 +1318,7 @@ static void init_columns(args_t *args) if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual; col->hdr_key = strdup(str.s); @@ -1264,30 +1393,38 @@ static void init_columns(args_t *args) } else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) ) { - if ( !args->tgts_is_vcf ) - error("Error: FORMAT fields can be carried over from a VCF file only.\n"); - char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7); if ( force_samples<0 ) force_samples = replace; - if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;; - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL); - tmp.l = 0; - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); - bcf_hdr_sync(args->hdr_out); + if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace; + if ( args->tgts_is_vcf ) + { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL); + tmp.l = 0; + bcf_hrec_format(hrec, &tmp); + bcf_hdr_append(args->hdr_out, tmp.s); + bcf_hdr_sync(args->hdr_out); + } int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key); + if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) + error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = -1; + if ( !args->tgts_is_vcf ) + { + col->icol = icol; + icol += bcf_hdr_nsamples(args->hdr_out) - 1; + } + else + col->icol = -1; 
col->replace = replace; col->hdr_key = strdup(key); if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt; else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { - case BCF_HT_INT: col->setter = vcf_setter_format_int; break; - case BCF_HT_REAL: col->setter = vcf_setter_format_real; break; - case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break; + case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_format_int : setter_format_int; break; + case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_format_real : setter_format_real; break; + case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_format_str : setter_format_str; has_fmt_str = 1; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id)); } } @@ -1316,7 +1453,7 @@ static void init_columns(args_t *args) args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = i; + col->icol = icol; col->replace = replace; col->hdr_key = strdup(str.s); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); @@ -1340,11 +1477,12 @@ static void init_columns(args_t *args) if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt); if ( has_fmt_str ) { - int n = bcf_hdr_nsamples(args->hdr_out) > bcf_hdr_nsamples(args->files->readers[1].header) ? bcf_hdr_nsamples(args->hdr_out) : bcf_hdr_nsamples(args->files->readers[1].header); + int n = bcf_hdr_nsamples(args->hdr_out); + if ( args->tgts_is_vcf && nfiles->readers[1].header) ) n = bcf_hdr_nsamples(args->files->readers[1].header); args->tmpp = (char**)malloc(sizeof(char*)*n); args->tmpp2 = (char**)malloc(sizeof(char*)*n); } - if ( force_samples>=0 ) + if ( force_samples>=0 && args->tgts_is_vcf ) set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 
0 : 1); } @@ -1421,7 +1559,7 @@ static void init_data(args_t *args) args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); } - bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); @@ -1519,8 +1657,10 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } if ( args->ref_idx != -1 ) { - assert( args->ref_idx < tmp->ncols ); - assert( args->alt_idx < tmp->ncols ); + if ( args->ref_idx >= tmp->ncols ) + error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s); + if ( args->alt_idx >= tmp->ncols ) + error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s); tmp->nals = 2; hts_expand(char*,tmp->nals,tmp->mals,tmp->als); tmp->als[0] = tmp->cols[args->ref_idx]; @@ -1617,28 +1757,29 @@ static void annotate(args_t *args, bcf1_t *line) static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Annotate and edit VCF/BCF files.\n"); - fprintf(pysamerr, "Usage: bcftools annotate [options] \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(pysamerr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); - fprintf(pysamerr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysamerr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(pysamerr, " -I, --set-id [+] set ID column, see man pagee for details\n"); - fprintf(pysamerr, " -i, --include select sites for which the expression is true (see man pagee for details)\n"); - fprintf(pysamerr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " --rename-chrs rename sequences according to map file: from\\tto\n"); - fprintf(pysamerr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(pysamerr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(pysamerr, " -x, --remove list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Annotate and edit VCF/BCF files.\n"); + fprintf(pysam_stderr, "Usage: bcftools annotate [options] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(pysam_stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); + fprintf(pysam_stderr, " -I, --set-id [+] set ID column, see man page for details\n"); + fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); + fprintf(pysam_stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(pysam_stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(pysam_stderr, " -x, --remove list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). 
See man page for details\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -1651,6 +1792,7 @@ int main_vcfannotate(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; args->set_ids_replace = 1; int regions_is_file = 0; @@ -1673,6 +1815,7 @@ int main_vcfannotate(int argc, char *argv[]) {"header-lines",required_argument,NULL,'h'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0) @@ -1707,6 +1850,7 @@ int main_vcfannotate(int argc, char *argv[]) case 'h': args->header_fname = optarg; break; case 1 : args->rename_chrs = optarg; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index a28caee..e5bbf11 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. 
Author: Petr Danecek @@ -68,7 +68,7 @@ void error(const char *format, ...); typedef struct { int flag; // combination of CF_* flags above - int output_type, n_threads; + int output_type, n_threads, record_cmd_line; htsFile *bcf_in, *out_fh; char *bcf_fname, *output_fname; char **samples; // for subsampling and ploidy @@ -175,6 +175,11 @@ static ploidy_predef_t ploidy_predefs[] = "* * * M 1\n" "* * * F 0\n" }, + { .alias = "1", + .about = "Treat all samples as haploid", + .ploidy = + "* * * * 1\n" + }, { .alias = NULL, .about = NULL, @@ -381,7 +386,7 @@ static void init_data(args_t *args) if ( args->regions ) { if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) - error("Failed to read the targets: %s\n", args->regions); + error("Failed to read the regions: %s\n", args->regions); } if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); @@ -396,9 +401,21 @@ static void init_data(args_t *args) if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname); fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams); } + } + if ( args->ploidy ) + { args->nsex = ploidy_nsex(args->ploidy); args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int)); args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int)); + if ( !args->nsamples ) + { + args->nsamples = bcf_hdr_nsamples(args->aux.hdr); + args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples); + for (i=0; insamples; i++) args->sample2sex[i] = 0; + } + } + if ( args->nsamples ) + { args->aux.ploidy = (uint8_t*) malloc(args->nsamples); for (i=0; insamples; i++) args->aux.ploidy[i] = 2; for (i=0; insex; i++) args->sex2ploidy_prev[i] = 2; @@ -418,9 +435,12 @@ static void init_data(args_t *args) else { args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0)); - for (i=0; insamples; i++) - if ( 
bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 ) - error("No such sample: %s\n", args->samples[i]); + if ( args->samples ) + { + for (i=0; insamples; i++) + if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 ) + error("No such sample: %s\n", args->samples[i]); + } } args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); @@ -439,7 +459,7 @@ static void init_data(args_t *args) bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS"); bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); - bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); + if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); bcf_hdr_write(args->out_fh, args->aux.hdr); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); @@ -451,7 +471,10 @@ static void destroy_data(args_t *args) else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux); else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); int i; - for (i=0; insamples; i++) free(args->samples[i]); + if ( args->samples ) + { + for (i=0; insamples; i++) free(args->samples[i]); + } if ( args->aux.fams ) { for (i=0; iaux.nfams; i++) free(args->aux.fams[i].name); @@ -579,6 +602,7 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools call [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "File format options:\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); fprintf(stderr, " --ploidy [?] predefined ploidy, 'list' to print available settings, append '?' 
for details\n"); @@ -634,6 +658,7 @@ int main_vcfcall(int argc, char *argv[]) args.output_fname = "-"; args.output_type = FT_VCF; args.n_threads = 0; + args.record_cmd_line = 1; args.aux.trio_Pm_SNPs = 1 - 1e-8; args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9; @@ -668,6 +693,7 @@ int main_vcfcall(int argc, char *argv[]) {"ploidy-file",required_argument,NULL,2}, {"chromosome-X",no_argument,NULL,'X'}, {"chromosome-Y",no_argument,NULL,'Y'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; @@ -727,6 +753,7 @@ int main_vcfcall(int argc, char *argv[]) case 's': args.samples_fname = optarg; break; case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break; case 9 : args.n_threads = strtol(optarg, 0, 0); break; + case 8 : args.record_cmd_line = 0; break; default: usage(&args); } } diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 9e8c1bb..8e59fd9 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -2,7 +2,7 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. 
Author: Petr Danecek @@ -70,7 +70,7 @@ void error(const char *format, ...); typedef struct { int flag; // combination of CF_* flags above - int output_type, n_threads; + int output_type, n_threads, record_cmd_line; htsFile *bcf_in, *out_fh; char *bcf_fname, *output_fname; char **samples; // for subsampling and ploidy @@ -177,6 +177,11 @@ static ploidy_predef_t ploidy_predefs[] = "* * * M 1\n" "* * * F 0\n" }, + { .alias = "1", + .about = "Treat all samples as haploid", + .ploidy = + "* * * * 1\n" + }, { .alias = NULL, .about = NULL, @@ -290,7 +295,7 @@ static void set_samples(args_t *args, const char *fn, int is_file) char x = *se, *xptr = se; *se = 0; int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss); - if ( ismpl < 0 ) { fprintf(pysamerr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + if ( ismpl < 0 ) { fprintf(pysam_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } ss = se+1; while ( *ss && isspace(*ss) ) ss++; @@ -383,7 +388,7 @@ static void init_data(args_t *args) if ( args->regions ) { if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) - error("Failed to read the targets: %s\n", args->regions); + error("Failed to read the regions: %s\n", args->regions); } if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); @@ -396,11 +401,23 @@ static void init_data(args_t *args) if ( args->aux.flag&CALL_CONSTR_TRIO ) { if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname); - fprintf(pysamerr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams); + fprintf(pysam_stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams); } + } + if ( args->ploidy ) + { args->nsex = ploidy_nsex(args->ploidy); args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int)); args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int)); + 
if ( !args->nsamples ) + { + args->nsamples = bcf_hdr_nsamples(args->aux.hdr); + args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples); + for (i=0; insamples; i++) args->sample2sex[i] = 0; + } + } + if ( args->nsamples ) + { args->aux.ploidy = (uint8_t*) malloc(args->nsamples); for (i=0; insamples; i++) args->aux.ploidy[i] = 2; for (i=0; insex; i++) args->sex2ploidy_prev[i] = 2; @@ -420,9 +437,12 @@ static void init_data(args_t *args) else { args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0)); - for (i=0; insamples; i++) - if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 ) - error("No such sample: %s\n", args->samples[i]); + if ( args->samples ) + { + for (i=0; insamples; i++) + if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 ) + error("No such sample: %s\n", args->samples[i]); + } } args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); @@ -441,7 +461,7 @@ static void init_data(args_t *args) bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS"); bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); - bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); + if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); bcf_hdr_write(args->out_fh, args->aux.hdr); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); @@ -453,7 +473,10 @@ static void destroy_data(args_t *args) else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux); else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); int i; - for (i=0; insamples; i++) free(args->samples[i]); + if ( args->samples ) + { + for (i=0; insamples; i++) free(args->samples[i]); + } if ( args->aux.fams ) { for (i=0; iaux.nfams; i++) free(args->aux.fams[i].name); @@ -507,7 +530,7 @@ static int parse_format_flag(const char *str) else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP; else { - fprintf(pysamerr,"Could not parse \"%s\"\n", str); + 
fprintf(pysam_stderr,"Could not parse \"%s\"\n", str); exit(1); } if ( !*se ) break; @@ -548,23 +571,23 @@ ploidy_t *init_ploidy(char *alias) if ( !pld->alias ) { - fprintf(pysamerr,"Predefined ploidies:\n"); + fprintf(pysam_stderr,"Predefined ploidies:\n"); pld = ploidy_predefs; while ( pld->alias ) { - fprintf(pysamerr,"%s\n .. %s\n\n", pld->alias,pld->about); + fprintf(pysam_stderr,"%s\n .. %s\n\n", pld->alias,pld->about); if ( detailed ) - fprintf(pysamerr,"%s\n", pld->ploidy); + fprintf(pysam_stderr,"%s\n", pld->ploidy); pld++; } - fprintf(pysamerr,"Run as --ploidy (e.g. --ploidy GRCh37).\n"); - fprintf(pysamerr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n"); - fprintf(pysamerr,"\n"); + fprintf(pysam_stderr,"Run as --ploidy (e.g. --ploidy GRCh37).\n"); + fprintf(pysam_stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n"); + fprintf(pysam_stderr,"\n"); exit(-1); } else if ( detailed ) { - fprintf(pysamerr,"%s", pld->ploidy); + fprintf(pysam_stderr,"%s", pld->ploidy); exit(-1); } return ploidy_init_string(pld->ploidy,2); @@ -572,51 +595,52 @@ ploidy_t *init_ploidy(char *alias) static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); - fprintf(pysamerr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); - fprintf(pysamerr, " functionality has been temporarily lost in the process of transition to htslib,\n"); - fprintf(pysamerr, " but will be added back on popular demand. 
The original calling model can be\n"); - fprintf(pysamerr, " invoked with the -c option.\n"); - fprintf(pysamerr, "Usage: bcftools call [options] \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "File format options:\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysamerr, " --ploidy [?] predefined ploidy, 'list' to print available settings, append '?' for details\n"); - fprintf(pysamerr, " --ploidy-file space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --samples list of samples to include [all samples]\n"); - fprintf(pysamerr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Input/output options:\n"); - fprintf(pysamerr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); - fprintf(pysamerr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); - fprintf(pysamerr, " -g, --gvcf ,[...] 
group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(pysamerr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); - fprintf(pysamerr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); - fprintf(pysamerr, " -V, --skip-variants skip indels/snps\n"); - fprintf(pysamerr, " -v, --variants-only output variant sites only\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Consensus/variant calling options:\n"); - fprintf(pysamerr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); - fprintf(pysamerr, " -C, --constrain one of: alleles, trio (see manual)\n"); - fprintf(pysamerr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(pysamerr, " -n, --novel-rate ,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(pysamerr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity) [1.1e-3]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); + fprintf(pysam_stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); + fprintf(pysam_stderr, " functionality has been temporarily lost in the process of transition to htslib,\n"); + fprintf(pysam_stderr, " but will be added back on popular demand. 
The original calling model can be\n"); + fprintf(pysam_stderr, " invoked with the -c option.\n"); + fprintf(pysam_stderr, "Usage: bcftools call [options] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "File format options:\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " --ploidy [?] predefined ploidy, 'list' to print available settings, append '?' for details\n"); + fprintf(pysam_stderr, " --ploidy-file space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --samples list of samples to include [all samples]\n"); + fprintf(pysam_stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Input/output options:\n"); + fprintf(pysam_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); + fprintf(pysam_stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); + fprintf(pysam_stderr, " -g, --gvcf ,[...] 
group non-variant sites into gVCF blocks by minimum per-sample DP\n"); + fprintf(pysam_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); + fprintf(pysam_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); + fprintf(pysam_stderr, " -V, --skip-variants skip indels/snps\n"); + fprintf(pysam_stderr, " -v, --variants-only output variant sites only\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Consensus/variant calling options:\n"); + fprintf(pysam_stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); + fprintf(pysam_stderr, " -C, --constrain one of: alleles, trio (see manual)\n"); + fprintf(pysam_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); + fprintf(pysam_stderr, " -n, --novel-rate ,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); + fprintf(pysam_stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity) [1.1e-3]\n"); // todo (and more) - // fprintf(pysamerr, "\nContrast calling and association test options:\n"); - // fprintf(pysamerr, " -1 INT number of group-1 samples [0]\n"); - // fprintf(pysamerr, " -C FLOAT posterior constrast for LRTaux.min_lrt); - // fprintf(pysamerr, " -U INT number of permutations for association testing (effective with -1) [0]\n"); - // fprintf(pysamerr, " -X FLOAT only perform permutations for P(chi^2)aux.min_perm_p); - fprintf(pysamerr, "\n"); + // fprintf(pysam_stderr, "\nContrast calling and association test options:\n"); + // fprintf(pysam_stderr, " -1 INT number of group-1 samples [0]\n"); + // fprintf(pysam_stderr, " -C FLOAT posterior constrast for LRTaux.min_lrt); + // fprintf(pysam_stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n"); + // fprintf(pysam_stderr, " -X FLOAT only perform permutations 
for P(chi^2)aux.min_perm_p); + fprintf(pysam_stderr, "\n"); exit(-1); } @@ -636,6 +660,7 @@ int main_vcfcall(int argc, char *argv[]) args.output_fname = "-"; args.output_type = FT_VCF; args.n_threads = 0; + args.record_cmd_line = 1; args.aux.trio_Pm_SNPs = 1 - 1e-8; args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9; @@ -670,6 +695,7 @@ int main_vcfcall(int argc, char *argv[]) {"ploidy-file",required_argument,NULL,2}, {"chromosome-X",no_argument,NULL,'X'}, {"chromosome-Y",no_argument,NULL,'Y'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; @@ -680,8 +706,8 @@ int main_vcfcall(int argc, char *argv[]) { case 2 : ploidy_fname = optarg; break; case 1 : ploidy = optarg; break; - case 'X': ploidy = "X"; fprintf(pysamerr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; - case 'Y': ploidy = "Y"; fprintf(pysamerr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; + case 'X': ploidy = "X"; fprintf(pysam_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; + case 'Y': ploidy = "Y"; fprintf(pysam_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) @@ -729,6 +755,7 @@ int main_vcfcall(int argc, char *argv[]) case 's': args.samples_fname = optarg; break; case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break; case 9 : args.n_threads = strtol(optarg, 0, 0); break; + case 8 : args.record_cmd_line = 0; break; default: usage(&args); } } @@ -745,7 +772,7 @@ int main_vcfcall(int argc, char *argv[]) if ( !ploidy_fname && !ploidy ) { - fprintf(pysamerr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n"); + fprintf(pysam_stderr,"Note: Neither --ploidy nor --ploidy-file given, 
assuming all sites are diploid\n"); args.ploidy = ploidy_init_string("",2); } diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index d8a1ca5..10a00b9 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -267,7 +267,7 @@ static void init_data(args_t *args) args->hmm = hmm_init(args->nstates, args->tprob, 10000); hmm_init_states(args->hmm, args->iprobs); - args->summary_fh = stdout; + args->summary_fh = pysam_stdout; if ( args->output_dir ) { init_sample_files(&args->query_sample, args->output_dir); @@ -306,7 +306,7 @@ static void py_plot_cnv(char *script, float th) char *cmd = msprintf("python %s -p %f", script, th); int ret = system(cmd); - if ( ret) fprintf(pysamerr, "The command returned non-zero status %d: %s\n", ret, cmd); + if ( ret) fprintf(pysam_stderr, "The command returned non-zero status %d: %s\n", ret, cmd); free(cmd); } @@ -641,7 +641,7 @@ static int set_observed_prob(args_t *args, sample_t *smpl, int isite) cn3_baf /= norm; #if DBG0 - if ( args->verbose ) fprintf(pysamerr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf); + if ( args->verbose ) fprintf(pysam_stderr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf); #endif double cn1_lrr = exp(-(lrr + 0.45)*(lrr + 0.45)/smpl->lrr_dev2); @@ -866,7 +866,7 @@ static int update_sample_args(args_t *args, sample_t *smpl, int ismpl) baf_AA_dev2 /= norm_baf_AA_dev2; if ( baf_dev2 < baf_AA_dev2 ) baf_dev2 = baf_AA_dev2; double max_mean_cn3 = 0.5 - sqrt(baf_dev2)*1.644854; // R: qnorm(0.95)=1.644854 - //fprintf(pysamerr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3); + //fprintf(pysam_stderr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3); assert( max_mean_cn3>0 ); double new_frac = 1./mean_cn3 - 2; @@ -936,13 +936,13 @@ static void cnv_flush_viterbi(args_t *args) if ( args->optimize_frac ) { int niter = 0; - fprintf(pysamerr,"Attempting to estimate the fraction of aberrant cells (chr 
%s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid)); + fprintf(pysam_stderr,"Attempting to estimate the fraction of aberrant cells (chr %s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid)); do { - fprintf(pysamerr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); + fprintf(pysam_stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); if ( args->control_sample.name ) - fprintf(pysamerr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); - fprintf(pysamerr,"\n"); + fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); + fprintf(pysam_stderr,"\n"); set_emission_probs(args); hmm_run_fwd_bwd(hmm, args->nsites, args->eprob, args->sites); } @@ -958,10 +958,10 @@ static void cnv_flush_viterbi(args_t *args) if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample); } - fprintf(pysamerr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); + fprintf(pysam_stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); if ( args->control_sample.name ) - fprintf(pysamerr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); - fprintf(pysamerr,"\n"); + fprintf(pysam_stderr,"\t.. 
%f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); + fprintf(pysam_stderr,"\n"); } set_emission_probs(args); @@ -971,7 +971,7 @@ static void cnv_flush_viterbi(args_t *args) double ori_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm)); hmm_run_baum_welch(hmm, args->nsites, args->eprob, args->sites); double new_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm)); - fprintf(pysamerr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii); + fprintf(pysam_stderr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii); double *tprob = init_tprob_matrix(nstates, 1-new_ii, args->same_prob); hmm_set_tprob(args->hmm, tprob, 10000); double *tprob_arr = hmm_get_tprob(hmm); @@ -983,9 +983,9 @@ static void cnv_flush_viterbi(args_t *args) { for (j=0; j\n"); - fprintf(pysamerr, "General Options:\n"); - fprintf(pysamerr, " -c, --control-sample optional control sample name to highlight differences\n"); - fprintf(pysamerr, " -f, --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(pysamerr, " -o, --output-dir \n"); - fprintf(pysamerr, " -p, --plot-threshold plot aberrant chromosomes with quality at least 'float'\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --query-sample query samply name\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, "HMM Options:\n"); - fprintf(pysamerr, " -a, --aberrant fraction of aberrant cells in query and control [1.0,1.0]\n"); - fprintf(pysamerr, " -b, --BAF-weight relative contribution from BAF [1]\n"); - fprintf(pysamerr, " -d, --BAF-dev expected BAF deviation in query and control [0.04,0.04]\n"); // experimental - fprintf(pysamerr, " -e, --err-prob uniform error probability [1e-4]\n"); - fprintf(pysamerr, " -k, --LRR-dev 
expected LRR deviation [0.2,0.2]\n"); // experimental - fprintf(pysamerr, " -l, --LRR-weight relative contribution from LRR [0.2]\n"); - fprintf(pysamerr, " -L, --LRR-smooth-win window of LRR moving average smoothing [10]\n"); - fprintf(pysamerr, " -O, --optimize estimate fraction of aberrant cells down to [1.0]\n"); - fprintf(pysamerr, " -P, --same-prob prior probability of -s/-c being the same [0.5]\n"); - fprintf(pysamerr, " -x, --xy-prob P(x|y) transition probability [1e-9]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n"); + fprintf(pysam_stderr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n"); + fprintf(pysam_stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n"); + fprintf(pysam_stderr, "Usage: bcftools cnv [OPTIONS] \n"); + fprintf(pysam_stderr, "General Options:\n"); + fprintf(pysam_stderr, " -c, --control-sample optional control sample name to highlight differences\n"); + fprintf(pysam_stderr, " -f, --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(pysam_stderr, " -o, --output-dir \n"); + fprintf(pysam_stderr, " -p, --plot-threshold plot aberrant chromosomes with quality at least 'float'\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --query-sample query samply name\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, "HMM Options:\n"); + fprintf(pysam_stderr, " -a, --aberrant fraction of aberrant cells in query and control [1.0,1.0]\n"); + fprintf(pysam_stderr, " -b, --BAF-weight relative 
contribution from BAF [1]\n"); + fprintf(pysam_stderr, " -d, --BAF-dev expected BAF deviation in query and control [0.04,0.04]\n"); // experimental + fprintf(pysam_stderr, " -e, --err-prob uniform error probability [1e-4]\n"); + fprintf(pysam_stderr, " -k, --LRR-dev expected LRR deviation [0.2,0.2]\n"); // experimental + fprintf(pysam_stderr, " -l, --LRR-weight relative contribution from LRR [0.2]\n"); + fprintf(pysam_stderr, " -L, --LRR-smooth-win window of LRR moving average smoothing [10]\n"); + fprintf(pysam_stderr, " -O, --optimize estimate fraction of aberrant cells down to [1.0]\n"); + fprintf(pysam_stderr, " -P, --same-prob prior probability of -s/-c being the same [0.5]\n"); + fprintf(pysam_stderr, " -x, --xy-prob P(x|y) transition probability [1e-9]\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -1379,7 +1379,7 @@ int main_vcfcnv(int argc, char *argv[]) } cnv_next_line(args, NULL); create_plots(args); - fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); + fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); destroy_data(args); free(args); return 0; diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index cfec7c0..bd6a00a 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -31,13 +31,15 @@ THE SOFTWARE. 
*/ #include #include #include +#include +#include // for hts_get_bgzfp() #include "bcftools.h" typedef struct _args_t { bcf_srs_t *files; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, record_cmd_line; bcf_hdr_t *out_hdr; int *seen_seq; @@ -50,7 +52,7 @@ typedef struct _args_t char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; - int compact_PS, phase_set_changed; + int compact_PS, phase_set_changed, naive_concat; } args_t; @@ -106,7 +108,7 @@ static void init_data(args_t *args) bcf_hdr_append(args->out_hdr,"##FORMAT="); bcf_hdr_append(args->out_hdr,"##FORMAT="); } - bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); + if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -176,8 +178,11 @@ static void destroy_data(args_t *args) for (i=0; infnames; i++) free(args->fnames[i]); free(args->fnames); if ( args->files ) bcf_sr_destroy(args->files); - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); - bcf_hdr_destroy(args->out_hdr); + if ( args->out_fh ) + { + if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); + } + if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr); free(args->seen_seq); free(args->start_pos); free(args->swap_phase); @@ -550,6 +555,108 @@ static void concat(args_t *args) } } +static void naive_concat(args_t *args) +{ + // only compressed BCF atm + BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; + + const size_t page_size = 32768; + char *buf = (char*) malloc(page_size); + kstring_t tmp = {0,0,0}; + int i; + for (i=0; infnames; i++) + { + htsFile *hts_fp = 
hts_open(args->fnames[i],"r"); + if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); + htsFormat type = *hts_get_format(hts_fp); + + if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); + if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); + + BGZF *fp = hts_get_bgzfp(hts_fp); + if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) + error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); + + uint8_t magic[5]; + if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); + + if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + hts_expand(char,tmp.l,tmp.m,tmp.s); + if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + + // write only the first header + if ( i==0 ) + { + if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); + if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); + if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname); + } + + // Output all non-header data that were read together with the header block + int nskip = fp->block_offset; + if ( fp->block_length - nskip > 0 ) + { + if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); + } + if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); + + + // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks + ssize_t nread, ncached = 0, nwr; + const int neof = 28; + char cached[neof]; + while (1) + { + nread = bgzf_raw_read(fp, buf, 
page_size); + + // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends + if ( nread<=0 ) break; + if ( nread<=neof ) // last block + { + if ( ncached ) + { + // flush the part of the cache that won't be needed + nwr = bgzf_raw_write(bgzf_out, cached, nread); + if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); + + // make space in the cache so that we can append to the end + if ( nread!=neof ) memmove(cached,cached+nread,neof-nread); + } + + // fill the cache and check for eof outside this loop + memcpy(cached+neof-nread,buf,nread); + break; + } + + // not the last block, flush the cache if full + if ( ncached ) + { + nwr = bgzf_raw_write(bgzf_out, cached, ncached); + if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached); + ncached = 0; + } + + // fill the cache + nread -= neof; + memcpy(cached,buf+nread,neof); + ncached = neof; + + nwr = bgzf_raw_write(bgzf_out, buf, nread); + if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); + } + if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) ) + { + nwr = bgzf_raw_write(bgzf_out, cached, neof); + if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof); + } + if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); + } + free(buf); + free(tmp.s); + if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); +} + static void usage(args_t *args) { fprintf(stderr, "\n"); @@ -558,7 +665,9 @@ static void usage(args_t *args) fprintf(stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n"); fprintf(stderr, " VCF into one. The input files must be sorted by chr and position. 
The files\n"); fprintf(stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); - fprintf(stderr, " the -a, --allow-overlaps option is specified.\n"); + fprintf(stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n"); + fprintf(stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); + fprintf(stderr, " if the BCF headers differ.\n"); fprintf(stderr, "Usage: bcftools concat [options] [ [...]]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); @@ -568,6 +677,8 @@ static void usage(args_t *args) fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n"); fprintf(stderr, " -f, --file-list Read the list of files from a file.\n"); fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); + fprintf(stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n"); fprintf(stderr, " -o, --output Write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); @@ -586,10 +697,12 @@ int main_vcfconcat(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->min_PQ = 30; static struct option loptions[] = { + {"naive",no_argument,NULL,'n'}, {"compact-PS",no_argument,NULL,'c'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, @@ -602,10 +715,11 @@ int main_vcfconcat(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"file-list",required_argument,NULL,'f'}, {"min-PQ",required_argument,NULL,'q'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; - while ((c = 
getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0) { switch (c) { case 'c': args->compact_PS = 1; break; @@ -617,6 +731,7 @@ int main_vcfconcat(int argc, char *argv[]) args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); break; + case 'n': args->naive_concat = 1; break; case 'a': args->allow_overlaps = 1; break; case 'l': args->phased_concat = 1; break; case 'f': args->file_list = optarg; break; @@ -631,6 +746,7 @@ int main_vcfconcat(int argc, char *argv[]) }; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -654,6 +770,15 @@ int main_vcfconcat(int argc, char *argv[]) if ( !args->nfnames ) usage(args); if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n"); if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n"); + if ( args->naive_concat ) + { + if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n"); + if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n"); + naive_concat(args); + destroy_data(args); + free(args); + return 0; + } init_data(args); concat(args); destroy_data(args); diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index 40db3f7..be2d6a2 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -33,13 +33,15 @@ THE SOFTWARE. 
*/ #include #include #include +#include +#include // for hts_get_bgzfp() #include "bcftools.h" typedef struct _args_t { bcf_srs_t *files; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, record_cmd_line; bcf_hdr_t *out_hdr; int *seen_seq; @@ -52,7 +54,7 @@ typedef struct _args_t char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; - int compact_PS, phase_set_changed; + int compact_PS, phase_set_changed, naive_concat; } args_t; @@ -108,7 +110,7 @@ static void init_data(args_t *args) bcf_hdr_append(args->out_hdr,"##FORMAT="); bcf_hdr_append(args->out_hdr,"##FORMAT="); } - bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); + if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -178,8 +180,11 @@ static void destroy_data(args_t *args) for (i=0; infnames; i++) free(args->fnames[i]); free(args->fnames); if ( args->files ) bcf_sr_destroy(args->files); - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); - bcf_hdr_destroy(args->out_hdr); + if ( args->out_fh ) + { + if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); + } + if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr); free(args->seen_seq); free(args->start_pos); free(args->swap_phase); @@ -231,7 +236,7 @@ static void phased_flush(args_t *args) { if ( !gt_absent_warned ) { - fprintf(pysamerr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); + fprintf(pysam_stderr,"GT is not present at %s:%d. 
(This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); gt_absent_warned = 1; } continue; @@ -242,7 +247,7 @@ static void phased_flush(args_t *args) { if ( !gt_absent_warned ) { - fprintf(pysamerr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); + fprintf(pysam_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); gt_absent_warned = 1; } continue; @@ -552,31 +557,137 @@ static void concat(args_t *args) } } +static void naive_concat(args_t *args) +{ + // only compressed BCF atm + BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; + + const size_t page_size = 32768; + char *buf = (char*) malloc(page_size); + kstring_t tmp = {0,0,0}; + int i; + for (i=0; infnames; i++) + { + htsFile *hts_fp = hts_open(args->fnames[i],"r"); + if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); + htsFormat type = *hts_get_format(hts_fp); + + if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); + if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); + + BGZF *fp = hts_get_bgzfp(hts_fp); + if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) + error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); + + uint8_t magic[5]; + if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); + + if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + hts_expand(char,tmp.l,tmp.m,tmp.s); + if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); + + // write only the first header + if ( i==0 ) + { + if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d 
bytes to %s\n", 5,args->output_fname); + if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); + if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname); + } + + // Output all non-header data that were read together with the header block + int nskip = fp->block_offset; + if ( fp->block_length - nskip > 0 ) + { + if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); + } + if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); + + + // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks + ssize_t nread, ncached = 0, nwr; + const int neof = 28; + char cached[neof]; + while (1) + { + nread = bgzf_raw_read(fp, buf, page_size); + + // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends + if ( nread<=0 ) break; + if ( nread<=neof ) // last block + { + if ( ncached ) + { + // flush the part of the cache that won't be needed + nwr = bgzf_raw_write(bgzf_out, cached, nread); + if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); + + // make space in the cache so that we can append to the end + if ( nread!=neof ) memmove(cached,cached+nread,neof-nread); + } + + // fill the cache and check for eof outside this loop + memcpy(cached+neof-nread,buf,nread); + break; + } + + // not the last block, flush the cache if full + if ( ncached ) + { + nwr = bgzf_raw_write(bgzf_out, cached, ncached); + if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached); + ncached = 0; + } + + // fill the cache + nread -= neof; + memcpy(cached,buf+nread,neof); + ncached = neof; + + nwr = bgzf_raw_write(bgzf_out, buf, nread); + if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); + } + if ( ncached && 
memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) ) + { + nwr = bgzf_raw_write(bgzf_out, cached, neof); + if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof); + } + if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); + } + free(buf); + free(tmp.s); + if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); +} + static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n"); - fprintf(pysamerr, " columns appearing in the same order. The program can be used, for example, to\n"); - fprintf(pysamerr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n"); - fprintf(pysamerr, " VCF into one. The input files must be sorted by chr and position. The files\n"); - fprintf(pysamerr, " must be given in the correct order to produce sorted VCF on output unless\n"); - fprintf(pysamerr, " the -a, --allow-overlaps option is specified.\n"); - fprintf(pysamerr, "Usage: bcftools concat [options] [ [...]]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); - fprintf(pysamerr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); - fprintf(pysamerr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); - fprintf(pysamerr, " -D, --remove-duplicates Alias for -d none\n"); - fprintf(pysamerr, " -f, --file-list Read the list of files from a file.\n"); - fprintf(pysamerr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); - fprintf(pysamerr, " -o, --output Write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: 
uncompressed VCF [v]\n"); - fprintf(pysamerr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); - fprintf(pysamerr, " -r, --regions Restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file Restrict to regions listed in a file\n"); - fprintf(pysamerr, " --threads Number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n"); + fprintf(pysam_stderr, " columns appearing in the same order. The program can be used, for example, to\n"); + fprintf(pysam_stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n"); + fprintf(pysam_stderr, " VCF into one. The input files must be sorted by chr and position. The files\n"); + fprintf(pysam_stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); + fprintf(pysam_stderr, " the -a, --allow-overlaps option is specified. 
With the --naive option, the files\n"); + fprintf(pysam_stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); + fprintf(pysam_stderr, " if the BCF headers differ.\n"); + fprintf(pysam_stderr, "Usage: bcftools concat [options] [ [...]]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); + fprintf(pysam_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); + fprintf(pysam_stderr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); + fprintf(pysam_stderr, " -D, --remove-duplicates Alias for -d none\n"); + fprintf(pysam_stderr, " -f, --file-list Read the list of files from a file.\n"); + fprintf(pysam_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n"); + fprintf(pysam_stderr, " -o, --output Write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); + fprintf(pysam_stderr, " -r, --regions Restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file Restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " --threads Number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -588,10 +699,12 @@ int main_vcfconcat(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->min_PQ = 
30; static struct option loptions[] = { + {"naive",no_argument,NULL,'n'}, {"compact-PS",no_argument,NULL,'c'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, @@ -604,10 +717,11 @@ int main_vcfconcat(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"file-list",required_argument,NULL,'f'}, {"min-PQ",required_argument,NULL,'q'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0) { switch (c) { case 'c': args->compact_PS = 1; break; @@ -619,6 +733,7 @@ int main_vcfconcat(int argc, char *argv[]) args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); break; + case 'n': args->naive_concat = 1; break; case 'a': args->allow_overlaps = 1; break; case 'l': args->phased_concat = 1; break; case 'f': args->file_list = optarg; break; @@ -633,6 +748,7 @@ int main_vcfconcat(int argc, char *argv[]) }; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -656,6 +772,15 @@ int main_vcfconcat(int argc, char *argv[]) if ( !args->nfnames ) usage(args); if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n"); if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n"); + if ( args->naive_concat ) + { + if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n"); + if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n"); + naive_concat(args); + destroy_data(args); + free(args); + return 0; + } init_data(args); concat(args); destroy_data(args); diff --git a/bcftools/vcfconvert.c 
b/bcftools/vcfconvert.c index 26166df..1e60d30 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -66,7 +66,7 @@ struct _args_t int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname; - int argc, n_threads; + int argc, n_threads, record_cmd_line; }; static void destroy_data(args_t *args) @@ -369,7 +369,7 @@ static void gensample_to_vcf(args_t *args) bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); @@ -489,7 +489,7 @@ static void haplegendsample_to_vcf(args_t *args) bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, nrows, nsamples; char **samples = hts_readlist(sample_fname, 1, &nrows); @@ -606,7 +606,7 @@ static void hapsample_to_vcf(args_t *args) bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); @@ -1143,7 +1143,7 @@ static void 
tsv_to_vcf(args_t *args) args->header = bcf_hdr_init("w"); bcf_hdr_set_chrs(args->header, args->ref); bcf_hdr_append(args->header, "##FORMAT="); - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, n; char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); @@ -1241,7 +1241,7 @@ static void gvcf_to_vcf(args_t *args) if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); - bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); bcf_hdr_write(out_fh,hdr); int32_t *itmp = NULL, nitmp = 0; @@ -1304,11 +1304,12 @@ static void usage(void) fprintf(stderr, " -S, --samples-file file of samples to include\n"); fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads number of extra output compression threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "VCF output options:\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output output file name [stdout]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(stderr, " --threads number of extra output compression threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(stderr, " -G, --gensample2vcf <...> |,\n"); @@ -1359,6 +1360,7 @@ int main_vcfconvert(int argc, char *argv[]) args->outfname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; static struct option loptions[] = { @@ -1387,6 
+1389,7 @@ int main_vcfconvert(int argc, char *argv[]) {"haplegendsample2vcf",required_argument,NULL,'H'}, {"columns",required_argument,NULL,'c'}, {"fasta-ref",required_argument,NULL,'f'}, + {"no-version",no_argument,NULL,10}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) { @@ -1424,6 +1427,7 @@ int main_vcfconvert(int argc, char *argv[]) break; case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 10 : args->record_cmd_line = 0; break; case '?': usage(); default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index 03b24b4..12333cc 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -68,7 +68,7 @@ struct _args_t int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname; - int argc, n_threads; + int argc, n_threads, record_cmd_line; }; static void destroy_data(args_t *args) @@ -211,13 +211,13 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr) { float aa,ab,bb; aa = strtod(tsv->ss, &tsv->se); - if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse first value of %d-th sample\n", i+1); return -1; } + if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse first value of %d-th sample\n", i+1); return -1; } tsv->ss = tsv->se+1; ab = strtod(tsv->ss, &tsv->se); - if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse second value of %d-th sample\n", i+1); return -1; } + if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse second value of %d-th sample\n", i+1); return -1; } tsv->ss = tsv->se+1; bb = strtod(tsv->ss, &tsv->se); - if ( tsv->ss==tsv->se ) { fprintf(pysamerr,"Could not parse third value of %d-th sample\n", i+1); return 
-1; } + if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse third value of %d-th sample\n", i+1); return -1; } tsv->ss = tsv->se+1; if ( args->rev_als ) { float tmp = bb; bb = aa; aa = tmp; } @@ -263,7 +263,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) if ( !ss[0] || !ss[1] || !ss[2] || (up && (!ss[3] || !ss[4]) ) ) { - fprintf(pysamerr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]); + fprintf(pysam_stderr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]); return -1; } @@ -282,7 +282,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) args->gts[2*i+all] = bcf_int32_vector_end; break; default : - fprintf(pysamerr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); + fprintf(pysam_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); return -1; } if( ss[all*2+up+1]=='*' ) up = up + 1; @@ -290,7 +290,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) if(up && up != 2) { - fprintf(pysamerr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); + fprintf(pysam_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); return -1; } @@ -304,8 +304,8 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) } if ( tsv->ss[(nsamples-1)*4+3+nup] ) { - fprintf(pysamerr,"nup: %d", nup); - fprintf(pysamerr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]); + fprintf(pysam_stderr,"nup: %d", nup); + fprintf(pysam_stderr,"Wrong number of fields (%d-th column = [%c]). 
", nsamples*2,tsv->ss[(nsamples-1)*4+nup]); return -1; } @@ -371,7 +371,7 @@ static void gensample_to_vcf(args_t *args) bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); @@ -417,7 +417,7 @@ static void gensample_to_vcf(args_t *args) free(args->flt); tsv_destroy(tsv); - fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total); + fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total); } static void haplegendsample_to_vcf(args_t *args) @@ -491,7 +491,7 @@ static void haplegendsample_to_vcf(args_t *args) bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, nrows, nsamples; char **samples = hts_readlist(sample_fname, 1, &nrows); @@ -554,7 +554,7 @@ static void haplegendsample_to_vcf(args_t *args) tsv_destroy(hap_tsv); tsv_destroy(leg_tsv); - fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total); + fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total); } static void hapsample_to_vcf(args_t *args) @@ -608,7 +608,7 @@ static void hapsample_to_vcf(args_t *args) bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if 
(args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); @@ -653,7 +653,7 @@ static void hapsample_to_vcf(args_t *args) free(args->gts); tsv_destroy(tsv); - fprintf(pysamerr,"Number of processed rows: \t%d\n", args->n.total); + fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total); } static void vcf_to_gensample(args_t *args) @@ -710,8 +710,8 @@ static void vcf_to_gensample(args_t *args) if ( gen_fname && (strlen(gen_fname)<3 || strcasecmp(".gz",gen_fname+strlen(gen_fname)-3)) ) gen_compressed = 0; if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0; - if (gen_fname) fprintf(pysamerr, "Gen file: %s\n", gen_fname); - if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname); + if (gen_fname) fprintf(pysam_stderr, "Gen file: %s\n", gen_fname); + if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); // write samples file if (sample_fname) { @@ -755,7 +755,7 @@ static void vcf_to_gensample(args_t *args) // biallelic required if ( line->n_allele>2 ) { if (!non_biallelic) - fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); + fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. 
Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); non_biallelic++; continue; } @@ -774,7 +774,7 @@ static void vcf_to_gensample(args_t *args) nok++; } } - fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", + fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup); if ( str.m ) free(str.s); @@ -826,9 +826,9 @@ static void vcf_to_haplegendsample(args_t *args) if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0; if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0; - if (hap_fname) fprintf(pysamerr, "Haps file: %s\n", hap_fname); - if (legend_fname) fprintf(pysamerr, "Legend file: %s\n", legend_fname); - if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname); + if (hap_fname) fprintf(pysam_stderr, "Haps file: %s\n", hap_fname); + if (legend_fname) fprintf(pysam_stderr, "Legend file: %s\n", legend_fname); + if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); // write samples file if (sample_fname) { @@ -879,7 +879,7 @@ static void vcf_to_haplegendsample(args_t *args) // biallelic required if ( line->n_allele>2 ) { if (!non_biallelic) - fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); + fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. 
Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); non_biallelic++; continue; } @@ -906,7 +906,7 @@ static void vcf_to_haplegendsample(args_t *args) } nok++; } - fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); + fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); if ( str.m ) free(str.s); if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno)); if ( lout && bgzf_close(lout)!=0 ) error("Error closing %s: %s\n", legend_fname, strerror(errno)); @@ -968,8 +968,8 @@ static void vcf_to_hapsample(args_t *args) if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0; if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0; - if (hap_fname) fprintf(pysamerr, "Haps file: %s\n", hap_fname); - if (sample_fname) fprintf(pysamerr, "Sample file: %s\n", sample_fname); + if (hap_fname) fprintf(pysam_stderr, "Haps file: %s\n", hap_fname); + if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); // write samples file if (sample_fname) { @@ -1013,7 +1013,7 @@ static void vcf_to_hapsample(args_t *args) // biallelic required if ( line->n_allele>2 ) { if (!non_biallelic) - fprintf(pysamerr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); + fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. 
Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); non_biallelic++; continue; } @@ -1029,7 +1029,7 @@ static void vcf_to_hapsample(args_t *args) } nok++; } - fprintf(pysamerr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); + fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); if ( str.m ) free(str.s); if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno)); if (hap_fname) free(hap_fname); @@ -1145,7 +1145,7 @@ static void tsv_to_vcf(args_t *args) args->header = bcf_hdr_init("w"); bcf_hdr_set_chrs(args->header, args->ref); bcf_hdr_append(args->header, "##FORMAT="); - bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); int i, n; char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); @@ -1197,13 +1197,13 @@ static void tsv_to_vcf(args_t *args) free(args->str.s); free(args->gts); - fprintf(pysamerr,"Rows total: \t%d\n", args->n.total); - fprintf(pysamerr,"Rows skipped: \t%d\n", args->n.skipped); - fprintf(pysamerr,"Missing GTs: \t%d\n", args->n.missing); - fprintf(pysamerr,"Hom RR: \t%d\n", args->n.hom_rr); - fprintf(pysamerr,"Het RA: \t%d\n", args->n.het_ra); - fprintf(pysamerr,"Hom AA: \t%d\n", args->n.hom_aa); - fprintf(pysamerr,"Het AA: \t%d\n", args->n.het_aa); + fprintf(pysam_stderr,"Rows total: \t%d\n", args->n.total); + fprintf(pysam_stderr,"Rows skipped: \t%d\n", args->n.skipped); + fprintf(pysam_stderr,"Missing GTs: \t%d\n", args->n.missing); + fprintf(pysam_stderr,"Hom RR: \t%d\n", args->n.hom_rr); + fprintf(pysam_stderr,"Het RA: \t%d\n", args->n.het_ra); + fprintf(pysam_stderr,"Hom AA: \t%d\n", 
args->n.hom_aa); + fprintf(pysam_stderr,"Het AA: \t%d\n", args->n.het_aa); } static void vcf_to_vcf(args_t *args) @@ -1243,7 +1243,7 @@ static void gvcf_to_vcf(args_t *args) if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); - bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); + if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); bcf_hdr_write(out_fh,hdr); int32_t *itmp = NULL, nitmp = 0; @@ -1291,65 +1291,66 @@ static void gvcf_to_vcf(args_t *args) static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Converts VCF/BCF to other formats and back. See man page for file\n"); - fprintf(pysamerr, " formats details. When specifying output files explicitly instead\n"); - fprintf(pysamerr, " of with , one can use '-' for stdout and '.' to suppress.\n"); - fprintf(pysamerr, "Usage: bcftools convert [OPTIONS] \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "VCF input options:\n"); - fprintf(pysamerr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(pysamerr, " -i, --include select sites for which the expression is true\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --samples list of samples to include\n"); - fprintf(pysamerr, " -S, --samples-file file of samples to include\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "VCF output options:\n"); - fprintf(pysamerr, " -o, --output output file name [stdout]\n"); - fprintf(pysamerr, " -O, --output-type b: compressed 
BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); - fprintf(pysamerr, " -G, --gensample2vcf <...> |,\n"); - fprintf(pysamerr, " -g, --gensample <...> |,\n"); - fprintf(pysamerr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); - fprintf(pysamerr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); - fprintf(pysamerr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "gVCF conversion:\n"); - fprintf(pysamerr, " --gvcf2vcf expand gVCF reference blocks\n"); - fprintf(pysamerr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "HAP/SAMPLE conversion (output from SHAPEIT):\n"); - fprintf(pysamerr, " --hapsample2vcf <...> |,\n"); - fprintf(pysamerr, " --hapsample <...> |,\n"); - fprintf(pysamerr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(pysamerr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "HAP/LEGEND/SAMPLE conversion:\n"); - fprintf(pysamerr, " -H, --haplegendsample2vcf <...> |,,\n"); - fprintf(pysamerr, " -h, --haplegendsample <...> |,,\n"); - fprintf(pysamerr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(pysamerr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "TSV conversion:\n"); - fprintf(pysamerr, " --tsv2vcf \n"); - fprintf(pysamerr, " -c, --columns columns of the input tsv file [ID,CHROM,POS,AA]\n"); - fprintf(pysamerr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(pysamerr, " -s, --samples list of sample names\n"); - fprintf(pysamerr, " -S, --samples-file file of sample names\n"); - fprintf(pysamerr, "\n"); - // fprintf(pysamerr, "PLINK 
options:\n"); - // fprintf(pysamerr, " -p, --plink |,,|,,|,\n"); - // fprintf(pysamerr, " --tped make tped file instead\n"); - // fprintf(pysamerr, " --bin make binary bed/fam/bim files\n"); - // fprintf(pysamerr, "\n"); - // fprintf(pysamerr, "PBWT options:\n"); - // fprintf(pysamerr, " -b, --pbwt or ,,,\n"); - // fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n"); + fprintf(pysam_stderr, " formats details. When specifying output files explicitly instead\n"); + fprintf(pysam_stderr, " of with , one can use '-' for pysam_stdout and '.' to suppress.\n"); + fprintf(pysam_stderr, "Usage: bcftools convert [OPTIONS] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "VCF input options:\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true\n"); + fprintf(pysam_stderr, " -i, --include select sites for which the expression is true\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --samples list of samples to include\n"); + fprintf(pysam_stderr, " -S, --samples-file file of samples to include\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "VCF output options:\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -o, --output output file name [pysam_stdout]\n"); + fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + 
fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); + fprintf(pysam_stderr, " -G, --gensample2vcf <...> |,\n"); + fprintf(pysam_stderr, " -g, --gensample <...> |,\n"); + fprintf(pysam_stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); + fprintf(pysam_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(pysam_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "gVCF conversion:\n"); + fprintf(pysam_stderr, " --gvcf2vcf expand gVCF reference blocks\n"); + fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n"); + fprintf(pysam_stderr, " --hapsample2vcf <...> |,\n"); + fprintf(pysam_stderr, " --hapsample <...> |,\n"); + fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "HAP/LEGEND/SAMPLE conversion:\n"); + fprintf(pysam_stderr, " -H, --haplegendsample2vcf <...> |,,\n"); + fprintf(pysam_stderr, " -h, --haplegendsample <...> |,,\n"); + fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "TSV conversion:\n"); + fprintf(pysam_stderr, " --tsv2vcf \n"); + fprintf(pysam_stderr, " -c, --columns columns of the input tsv file [ID,CHROM,POS,AA]\n"); + fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(pysam_stderr, " -s, --samples list of sample names\n"); + fprintf(pysam_stderr, " -S, --samples-file file of sample names\n"); + 
fprintf(pysam_stderr, "\n"); + // fprintf(pysam_stderr, "PLINK options:\n"); + // fprintf(pysam_stderr, " -p, --plink |,,|,,|,\n"); + // fprintf(pysam_stderr, " --tped make tped file instead\n"); + // fprintf(pysam_stderr, " --bin make binary bed/fam/bim files\n"); + // fprintf(pysam_stderr, "\n"); + // fprintf(pysam_stderr, "PBWT options:\n"); + // fprintf(pysam_stderr, " -b, --pbwt or ,,,\n"); + // fprintf(pysam_stderr, "\n"); exit(1); } @@ -1361,6 +1362,7 @@ int main_vcfconvert(int argc, char *argv[]) args->outfname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; static struct option loptions[] = { @@ -1389,6 +1391,7 @@ int main_vcfconvert(int argc, char *argv[]) {"haplegendsample2vcf",required_argument,NULL,'H'}, {"columns",required_argument,NULL,'c'}, {"fasta-ref",required_argument,NULL,'f'}, + {"no-version",no_argument,NULL,10}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) { @@ -1426,6 +1429,7 @@ int main_vcfconvert(int argc, char *argv[]) break; case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 10 : args->record_cmd_line = 0; break; case '?': usage(); default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c index ac4c3a3..f979d77 100644 --- a/bcftools/vcffilter.c +++ b/bcftools/vcffilter.c @@ -71,7 +71,7 @@ typedef struct _args_t int output_type, n_threads; char **argv, *output_fname, *targets_list, *regions_list; - int argc; + int argc, record_cmd_line; } args_t; @@ -149,7 +149,7 @@ static void init_data(args_t *args) } } - bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); @@ -408,6 +408,7 @@ 
static void usage(args_t *args) fprintf(stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); fprintf(stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); @@ -430,6 +431,7 @@ int main_vcffilter(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = @@ -448,6 +450,7 @@ int main_vcffilter(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"SnpGap",required_argument,NULL,'g'}, {"IndelGap",required_argument,NULL,'G'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; @@ -488,6 +491,7 @@ int main_vcffilter(int argc, char *argv[]) else error("The argument to -S not recognised: %s\n", optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(args); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index c731ba3..58193da 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -73,7 +73,7 @@ typedef struct _args_t int output_type, n_threads; char **argv, *output_fname, *targets_list, *regions_list; - int argc; + int argc, record_cmd_line; } args_t; @@ -131,7 +131,7 @@ static void init_data(args_t *args) if ( 
tmp.s ) kputs(" and ", &tmp); kputs("\"IndelGap\"", &tmp); } - fprintf(pysamerr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); + fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); free(tmp.s); } @@ -151,7 +151,7 @@ static void init_data(args_t *args) } } - bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); @@ -400,26 +400,27 @@ static void set_genotypes(args_t *args, bcf1_t *line, int pass_site) static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Apply fixed-threshold filters.\n"); - fprintf(pysamerr, "Usage: bcftools filter [options] \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysamerr, " -g, --SnpGap filter SNPs within base pairs of an indel\n"); - fprintf(pysamerr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); - fprintf(pysamerr, " -i, --include include only sites for which the expression is true (see man page for details\n"); - fprintf(pysamerr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --soft-filter annotate FILTER column with or unique filter name 
(\"Filter%%d\") made up by the program (\"+\")\n"); - fprintf(pysamerr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Apply fixed-threshold filters.\n"); + fprintf(pysam_stderr, "Usage: bcftools filter [options] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -g, --SnpGap filter SNPs within base pairs of an indel\n"); + fprintf(pysam_stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); + fprintf(pysam_stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); + fprintf(pysam_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --soft-filter annotate FILTER column with or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n"); + fprintf(pysam_stderr, " -S, --set-GTs <.|0> set genotypes of failed samples 
to missing (.) or ref (0)\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -432,6 +433,7 @@ int main_vcffilter(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = @@ -450,6 +452,7 @@ int main_vcffilter(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"SnpGap",required_argument,NULL,'g'}, {"IndelGap",required_argument,NULL,'G'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; @@ -490,6 +493,7 @@ int main_vcffilter(int argc, char *argv[]) else error("The argument to -S not recognised: %s\n", optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(args); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index 161ca3c..2f0a288 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -62,7 +62,7 @@ void py_plot(char *script) int len = strlen(script); char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script); int ret = system(cmd); - if ( ret ) fprintf(pysamerr, "The command returned non-zero status %d: %s\n", ret, cmd); + if ( ret ) fprintf(pysam_stderr, "The command returned non-zero status %d: %s\n", ret, cmd); free(cmd); } @@ -272,7 +272,7 @@ static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2i gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? 
bcf_ij2G(k,l) : bcf_ij2G(l,k); } } - //for (i=0; ism_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); if ( !args->no_PLs ) - fprintf(pysamerr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); + fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); fake_pls = 1; } - FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout; + FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout; print_header(args, fp); int tgt_isample = -1, query_isample = 0; @@ -370,7 +370,7 @@ static void check_gt(args_t *args) { if ( tgt_isample==-1 ) { - fprintf(pysamerr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); + fprintf(pysam_stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); tgt_isample = 0; } } @@ -556,12 +556,12 @@ static void cross_check_gts(args_t *args) if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); if ( !args->no_PLs ) - fprintf(pysamerr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); + fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); fake_pls = 1; } if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1; - FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout; + FILE *fp = args->plot ? 
open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout; print_header(args, fp); if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n"); @@ -640,8 +640,8 @@ static void cross_check_gts(args_t *args) if ( args->tmp_arr ) free(args->tmp_arr); if ( is_hom ) free(is_hom); - if ( pl_warned ) fprintf(pysamerr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned); - if ( dp_warned ) fprintf(pysamerr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned); + if ( pl_warned ) fprintf(pysam_stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned); + if ( dp_warned ) fprintf(pysam_stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned); // Output samples sorted by average discordance double *score = (double*) calloc(nsamples,sizeof(double)); @@ -709,23 +709,23 @@ static char *init_prefix(char *prefix) static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Check sample identity. 
With no -g BCF given, multi-sample cross-check is performed.\n"); - fprintf(pysamerr, "Usage: bcftools gtcheck [options] [-g ] \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -a, --all-sites output comparison for all sites\n"); - fprintf(pysamerr, " -g, --genotypes genotypes to compare against\n"); - fprintf(pysamerr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); - fprintf(pysamerr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); - fprintf(pysamerr, " -p, --plot plot\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --query-sample query sample (by default the first sample is checked)\n"); - fprintf(pysamerr, " -S, --target-sample target sample in the -g file (used only for plotting)\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Check sample identity. 
With no -g BCF given, multi-sample cross-check is performed.\n"); + fprintf(pysam_stderr, "Usage: bcftools gtcheck [options] [-g ] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -a, --all-sites output comparison for all sites\n"); + fprintf(pysam_stderr, " -g, --genotypes genotypes to compare against\n"); + fprintf(pysam_stderr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); + fprintf(pysam_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); + fprintf(pysam_stderr, " -p, --plot plot\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --query-sample query sample (by default the first sample is checked)\n"); + fprintf(pysam_stderr, " -S, --target-sample target sample in the -g file (used only for plotting)\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c index e40fab5..d1e9179 100644 --- a/bcftools/vcfindex.c +++ b/bcftools/vcfindex.c @@ -1,7 +1,7 @@ /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2016 Genome Research Ltd. 
Author: Shane McCarthy @@ -177,6 +177,7 @@ int main_vcfindex(int argc, char *argv[]) if (stats) return vcf_index_stats(fname, stats); htsFile *fp = hts_open(fname,"r"); + if ( !fp ) error("Failed to read %s\n", fname); htsFormat type = *hts_get_format(fp); hts_close(fp); diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index 1cfde16..479fc57 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -3,7 +3,7 @@ /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2016 Genome Research Ltd. Author: Shane McCarthy @@ -40,20 +40,20 @@ DEALINGS IN THE SOFTWARE. */ static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Index bgzip compressed VCF/BCF files for random access.\n"); - fprintf(pysamerr, "Usage: bcftools index [options] |\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Indexing options:\n"); - fprintf(pysamerr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); - fprintf(pysamerr, " -f, --force overwrite index if it already exists\n"); - fprintf(pysamerr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(pysamerr, " -t, --tbi generate TBI-format index for VCF files\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Stats options:\n"); - fprintf(pysamerr, " -n, --nrecords print number of records based on existing index file\n"); - fprintf(pysamerr, " -s, --stats print per contig stats based on existing index file\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Index bgzip compressed VCF/BCF files for random access.\n"); + fprintf(pysam_stderr, "Usage: bcftools index [options] |\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Indexing options:\n"); + fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); + fprintf(pysam_stderr, " -f, --force 
overwrite index if it already exists\n"); + fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); + fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Stats options:\n"); + fprintf(pysam_stderr, " -n, --nrecords print number of records based on existing index file\n"); + fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -61,7 +61,7 @@ int vcf_index_stats(char *fname, int stats) { char *fn_out = NULL; FILE *out; - out = fn_out ? fopen(fn_out, "w") : stdout; + out = fn_out ? fopen(fn_out, "w") : pysam_stdout; const char **seq; int i, nseq; @@ -69,23 +69,23 @@ int vcf_index_stats(char *fname, int stats) hts_idx_t *idx = NULL; htsFile *fp = hts_open(fname,"r"); - if ( !fp ) { fprintf(pysamerr,"Could not read %s\n", fname); return 1; } + if ( !fp ) { fprintf(pysam_stderr,"Could not read %s\n", fname); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); - if ( !hdr ) { fprintf(pysamerr,"Could not read the header: %s\n", fname); return 1; } + if ( !hdr ) { fprintf(pysam_stderr,"Could not read the header: %s\n", fname); return 1; } if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); - if ( !tbx ) { fprintf(pysamerr,"Could not load TBI index: %s\n", fname); return 1; } + if ( !tbx ) { fprintf(pysam_stderr,"Could not load TBI index: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); - if ( !idx ) { fprintf(pysamerr,"Could not load CSI index: %s\n", fname); return 1; } + if ( !idx ) { fprintf(pysam_stderr,"Could not load CSI index: %s\n", fname); return 1; } } else { - fprintf(pysamerr,"Could not detect the file type as VCF or BCF: %s\n", fname); + fprintf(pysam_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); return 1; } @@ -108,7 +108,7 @@ int vcf_index_stats(char 
*fname, int stats) bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { - fprintf(pysamerr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); + fprintf(pysam_stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); return 1; } bcf_destroy1(rec); @@ -161,17 +161,17 @@ int main_vcfindex(int argc, char *argv[]) if ( optind==argc ) usage(); if (stats>2) { - fprintf(pysamerr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); + fprintf(pysam_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); return 1; } if (tbi && min_shift>0) { - fprintf(pysamerr, "[E::%s] min-shift option only expected for CSI indices \n", __func__); + fprintf(pysam_stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__); return 1; } if (min_shift < 0 || min_shift > 30) { - fprintf(pysamerr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift); + fprintf(pysam_stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift); return 1; } @@ -179,29 +179,30 @@ int main_vcfindex(int argc, char *argv[]) if (stats) return vcf_index_stats(fname, stats); htsFile *fp = hts_open(fname,"r"); + if ( !fp ) error("Failed to read %s\n", fname); htsFormat type = *hts_get_format(fp); hts_close(fp); if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf ) { - fprintf(pysamerr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__); + fprintf(pysam_stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__); if ( type.compression!=bgzf ) - fprintf(pysamerr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__); + fprintf(pysam_stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__); return 1; } if (tbi && type.format==bcf) { 
- fprintf(pysamerr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n"); + fprintf(pysam_stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n"); tbi = 0; min_shift = BCF_LIDX_SHIFT; } if (min_shift == 0 && type.format==bcf) { - fprintf(pysamerr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__); + fprintf(pysam_stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__); return 1; } if (!tbi && type.format==vcf && min_shift == 0) { - fprintf(pysamerr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n"); + fprintf(pysam_stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n"); tbi = 1; } @@ -216,7 +217,7 @@ int main_vcfindex(int argc, char *argv[]) stat(fname, &stat_file); if ( stat_file.st_mtime <= stat_tbi.st_mtime ) { - fprintf(pysamerr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__); + fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__); return 1; } } @@ -226,7 +227,7 @@ int main_vcfindex(int argc, char *argv[]) { if ( bcf_index_build(fname, min_shift) != 0 ) { - fprintf(pysamerr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname); + fprintf(pysam_stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname); return 1; } } @@ -234,7 +235,7 @@ int main_vcfindex(int argc, char *argv[]) { if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 ) { - fprintf(pysamerr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname); + fprintf(pysam_stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname); return 1; } } diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index 6115146..9afe620 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -58,7 +58,7 @@ typedef struct htsFile **fh_out; char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list; char *isec_exact; - int argc; + int argc, record_cmd_line; } args_t; @@ -143,7 
+143,7 @@ void isec_vcf(args_t *args) out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); - bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); + if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); bcf_hdr_write(out_fh, files->readers[args->iwrite].header); } if ( !args->nwrite && !out_std && !args->prefix ) @@ -351,7 +351,7 @@ static void init_data(args_t *args) args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ - bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ + if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \ } if ( !args->nwrite || args->write[0] ) @@ -456,6 +456,7 @@ static void usage(void) fprintf(stderr, " -e, --exclude exclude sites for which the expression is true\n"); fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); fprintf(stderr, " -i, --include include only sites for which the expression is true\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -n, --nfiles [+-=~] output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); @@ -464,8 +465,8 @@ static void usage(void) fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " -w, --write list of files to write with -p given as 1-based indexes. By default, all files are written\n"); fprintf(stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); @@ -492,6 +493,7 @@ int main_vcfisec(int argc, char *argv[]) args->output_fname = NULL; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -512,6 +514,7 @@ int main_vcfisec(int argc, char *argv[]) {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) { @@ -560,6 +563,7 @@ int main_vcfisec(int argc, char *argv[]) } break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index 2418895..758d475 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -60,7 +60,7 @@ typedef struct htsFile **fh_out; char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list; char *isec_exact; - int argc; + int argc, record_cmd_line; } args_t; @@ -136,7 +136,7 @@ void isec_vcf(args_t *args) kstring_t str = {0,0,0}; htsFile *out_fh = NULL; - // When only one VCF is output, print VCF to stdout or -o file + // When only one VCF is output, print VCF to pysam_stdout or -o file int out_std = 0; if ( args->nwrite==1 && !args->prefix ) out_std = 1; if ( args->targets_list && files->nreaders==1 ) out_std = 1; @@ -145,11 +145,11 @@ void isec_vcf(args_t *args) out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? 
args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); - bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); + if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); bcf_hdr_write(out_fh, files->readers[args->iwrite].header); } if ( !args->nwrite && !out_std && !args->prefix ) - fprintf(pysamerr,"Note: -w option not given, printing list of sites...\n"); + fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n"); int n; while ( (n=bcf_sr_next_line(files)) ) @@ -353,7 +353,7 @@ static void init_data(args_t *args) args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ - bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ + if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \ } if ( !args->nwrite || args->write[0] ) @@ -402,7 +402,7 @@ static void init_data(args_t *args) if ( args->fh_sites == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); } else - args->fh_sites = stdout; + args->fh_sites = pysam_stdout; } } @@ -448,40 +448,41 @@ static void destroy_data(args_t *args) static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Create intersections, unions and complements of VCF files.\n"); - fprintf(pysamerr, "Usage: bcftools isec [options] [...]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(pysamerr, " -C, --complement 
output positions present only in the first file but missing in the others\n"); - fprintf(pysamerr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(pysamerr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); - fprintf(pysamerr, " -i, --include include only sites for which the expression is true\n"); - fprintf(pysamerr, " -n, --nfiles [+-=~] output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysamerr, " -p, --prefix if given, subset each of the input files accordingly, see also -w\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Examples:\n"); - fprintf(pysamerr, " # Create intersection and complements of two sets saving the output in dir/*\n"); - fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, " # Filter sites in A and B (but not in C) and create intersection\n"); - fprintf(pysamerr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, " # Extract and write records from A shared by both A and B using exact allele match\n"); - fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, " # Extract records private to A or B comparing by position only\n"); - fprintf(pysamerr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Create intersections, unions and complements of VCF files.\n"); + fprintf(pysam_stderr, "Usage: bcftools isec [options] [...]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); + fprintf(pysam_stderr, " -C, --complement output positions present only in the first file but missing in the others\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true\n"); + fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(pysam_stderr, " -i, --include include only sites for which the expression is true\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -n, --nfiles [+-=~] output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -p, --prefix if given, subset each of the input files accordingly, see also -w\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Examples:\n"); + fprintf(pysam_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); + fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, " # Filter sites in A and B (but not in C) and create intersection\n"); + fprintf(pysam_stderr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, " # Extract and write records from A shared by both A and B using exact allele match\n"); + fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, " # Extract records private to A or B comparing by position only\n"); + fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -494,6 +495,7 @@ int main_vcfisec(int argc, char *argv[]) args->output_fname = NULL; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -514,6 +516,7 @@ int main_vcfisec(int argc, char *argv[]) {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) { @@ -562,6 +565,7 @@ int main_vcfisec(int argc, char *argv[]) } break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index 0517bd5..02fac6b 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -118,7 +118,7 @@ typedef struct 
htsFile *out_fh; bcf_hdr_t *out_hdr; char **argv; - int argc, n_threads; + int argc, n_threads, record_cmd_line; } args_t; @@ -858,7 +858,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst } if ( ith_src!=isrc ) return -1; // requested field not found int end_src = start_src; - while ( end_srcout_hdr, args->files->readers[i].header,buf,args->force_samples); } - bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); + if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); bcf_hdr_sync(args->out_hdr); } info_rules_init(args); @@ -1962,6 +1962,7 @@ static void usage(void) fprintf(stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(stderr, " -l, --file-list read file names from the file\n"); fprintf(stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); @@ -1980,6 +1981,7 @@ int main_vcfmerge(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->collapse = COLLAPSE_BOTH; int regions_is_file = 0; @@ -1998,6 +2000,7 @@ int main_vcfmerge(int argc, char *argv[]) {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"info-rules",required_argument,NULL,'i'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) { @@ -2032,6 +2035,7 @@ int main_vcfmerge(int argc, 
char *argv[]) case 2 : args->header_only = 1; break; case 3 : args->force_samples = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index 94b5252..daac458 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -120,7 +120,7 @@ typedef struct htsFile *out_fh; bcf_hdr_t *out_hdr; char **argv; - int argc, n_threads; + int argc, n_threads, record_cmd_line; } args_t; @@ -451,8 +451,8 @@ void merge_headers(bcf_hdr_t *hw, const bcf_hdr_t *hr, const char *clash_prefix, void debug_als(char **als, int nals) { - int k; for (k=0; knals; i++) { - printf(" %s [%d]", ma->als[i], ma->cnt[i]); + fprintf(pysam_stdout, " %s [%d]", ma->als[i], ma->cnt[i]); } - printf("\n"); + fprintf(pysam_stdout, "\n"); } void merge_chrom2qual(args_t *args, bcf1_t *out) @@ -860,7 +860,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst } if ( ith_src!=isrc ) return -1; // requested field not found int end_src = start_src; - while ( end_srctype=%d\n", __FILE__,__LINE__, info->type); exit(1); + default: fprintf(pysam_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); } #undef BRANCH } @@ -976,7 +976,7 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break; case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break; - default: fprintf(pysamerr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); + default: fprintf(pysam_stderr,"TODO: %s:%d .. 
info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); } #undef BRANCH } @@ -1556,7 +1556,7 @@ void shake_buffer(maux_t *maux, int ir, int pos) if ( !reader->buffer ) return; int i; - // FILE *fp = stdout; + // FILE *fp = pysam_stdout; // fprintf(fp," nbuf=%d\t", reader->nbuffer); for (i=0; inbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n"); // debug_buffer(fp,reader); // fprintf(fp,"--\n"); @@ -1641,43 +1641,43 @@ void debug_maux(args_t *args, int pos, int var_type) maux_t *maux = args->maux; int j,k,l; - fprintf(pysamerr,"Alleles to merge at %d\n", pos+1); + fprintf(pysam_stderr,"Alleles to merge at %d\n", pos+1); for (j=0; jnreaders; j++) { bcf_sr_t *reader = &files->readers[j]; - fprintf(pysamerr," reader %d: ", j); + fprintf(pysam_stderr," reader %d: ", j); for (k=0; k<=reader->nbuffer; k++) { if ( maux->d[j][k].skip==SKIP_DONE ) continue; bcf1_t *line = reader->buffer[k]; if ( line->pos!=pos ) continue; - fprintf(pysamerr,"\t"); - if ( maux->d[j][k].skip ) fprintf(pysamerr,"["); // this record will not be merged in this round + fprintf(pysam_stderr,"\t"); + if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round for (l=0; ln_allele; l++) - fprintf(pysamerr,"%s%s", l==0?"":",", line->d.allele[l]); - if ( maux->d[j][k].skip ) fprintf(pysamerr,"]"); + fprintf(pysam_stderr,"%s%s", l==0?"":",", line->d.allele[l]); + if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"]"); } - fprintf(pysamerr,"\n"); + fprintf(pysam_stderr,"\n"); } - fprintf(pysamerr," counts: "); - for (j=0; jnals; j++) fprintf(pysamerr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysamerr,"\n"); + fprintf(pysam_stderr," counts: "); + for (j=0; jnals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysam_stderr,"\n"); for (j=0; jnreaders; j++) { bcf_sr_t *reader = &files->readers[j]; - fprintf(pysamerr," out %d: ", j); + fprintf(pysam_stderr," out %d: ", j); for (k=0; k<=reader->nbuffer; 
k++) { if ( maux->d[j][k].skip==SKIP_DONE ) continue; bcf1_t *line = reader->buffer[k]; if ( line->pos!=pos ) continue; if ( maux->d[j][k].skip ) continue; - fprintf(pysamerr,"\t"); + fprintf(pysam_stderr,"\t"); for (l=0; ln_allele; l++) - fprintf(pysamerr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]); + fprintf(pysam_stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]); } - fprintf(pysamerr,"\n"); + fprintf(pysam_stderr,"\n"); } - fprintf(pysamerr,"\n"); + fprintf(pysam_stderr,"\n"); } // Determine which line should be merged from which reader: go through all @@ -1915,7 +1915,7 @@ void merge_vcf(args_t *args) char buf[10]; snprintf(buf,10,"%d",i+1); merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples); } - bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); + if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); bcf_hdr_sync(args->out_hdr); } info_rules_init(args); @@ -1950,26 +1950,27 @@ void merge_vcf(args_t *args) static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n"); - fprintf(pysamerr, " Note that only records from different files can be merged, never from the same file. For\n"); - fprintf(pysamerr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n"); - fprintf(pysamerr, "Usage: bcftools merge [options] [...]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " --force-samples resolve duplicate sample names\n"); - fprintf(pysamerr, " --print-header print only the merged header and exit\n"); - fprintf(pysamerr, " --use-header use the provided header\n"); - fprintf(pysamerr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(pysamerr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); - fprintf(pysamerr, " -l, --file-list read file names from the file\n"); - fprintf(pysamerr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n"); + fprintf(pysam_stderr, " Note that only records from different files can be merged, never from the same file. For\n"); + fprintf(pysam_stderr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n"); + fprintf(pysam_stderr, "Usage: bcftools merge [options] [...]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " --force-samples resolve duplicate sample names\n"); + fprintf(pysam_stderr, " --print-header print only the merged header and exit\n"); + fprintf(pysam_stderr, " --use-header use the provided header\n"); + fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(pysam_stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); + fprintf(pysam_stderr, " -l, --file-list read file names from the file\n"); + fprintf(pysam_stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -1982,6 +1983,7 @@ int main_vcfmerge(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->collapse = COLLAPSE_BOTH; int regions_is_file = 0; @@ -2000,6 +2002,7 @@ int main_vcfmerge(int argc, char *argv[]) {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"info-rules",required_argument,NULL,'i'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) { @@ -2034,6 +2037,7 @@ int main_vcfmerge(int argc, char *argv[]) case 2 : args->header_only = 1; break; case 3 : args->force_samples = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index 732eca9..781833c 100644 --- a/bcftools/vcfnorm.c +++ 
b/bcftools/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Petr Danecek @@ -76,6 +76,7 @@ typedef struct char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; + int record_cmd_line; } args_t; @@ -295,17 +296,19 @@ static int realign(args_t *args, bcf1_t *line) if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; } - // trim from right int ori_pos = line->pos; while (1) { // is the rightmost base identical in all alleles? + int min_len = als[0].l; for (i=1; in_allele; i++) { if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break; + if ( als[i].l < min_len ) min_len = als[i].l; } if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed + if ( min_len<=1 && line->pos==0 ) break; int pad_from_left = 0; for (i=0; in_allele; i++) // trim all alleles @@ -343,7 +346,7 @@ static int realign(args_t *args, bcf1_t *line) if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break; if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; } - if ( i!=line->n_allele || min_len==1 ) break; // there are differences, cannot be trimmed + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed ntrim_left++; } if ( ntrim_left ) @@ -1287,7 +1290,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm { kstring_t *tmp = &args->tmp_str[i]; kputsn(tmp->s,tmp->l,&str); - for (j=tmp->l; jl; jntmp_arr2 = str.m; args->tmp_arr2 = (uint8_t*)str.s; @@ -1581,7 +1584,7 @@ static void normalize_vcf(args_t *args) htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, 
strerror(errno)); if ( args->n_threads ) hts_set_threads(out, args->n_threads); - bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); bcf_hdr_write(out, args->hdr); int prev_rid = -1, prev_pos = -1, prev_type = 0; @@ -1641,7 +1644,6 @@ static void normalize_vcf(args_t *args) if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break; j++; } - if ( args->rbuf.n==args->rbuf.m ) j = 1; if ( j>0 ) flush_buffer(args, out, j); } flush_buffer(args, out, args->rbuf.n); @@ -1666,6 +1668,7 @@ static void usage(void) fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); fprintf(stderr, " -f, --fasta-ref reference sequence\n"); fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); @@ -1674,8 +1677,8 @@ static void usage(void) fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, "\n"); exit(1); } @@ 
-1689,6 +1692,7 @@ int main_vcfnorm(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->aln_win = 100; args->buf_win = 1000; args->mrows_collapse = COLLAPSE_BOTH; @@ -1714,6 +1718,7 @@ int main_vcfnorm(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"check-ref",required_argument,NULL,'c'}, {"strict-filter",no_argument,NULL,'s'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; @@ -1771,6 +1776,7 @@ int main_vcfnorm(int argc, char *argv[]) if ( *tmp ) error("Could not parse argument: --site-win %s\n", optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index 2cdf399..200ce79 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -2,7 +2,7 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Petr Danecek @@ -78,6 +78,7 @@ typedef struct char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; + int record_cmd_line; } args_t; @@ -275,7 +276,7 @@ static int realign(args_t *args, bcf1_t *line) if ( args->check_ref==CHECK_REF_EXIT ) error("Reference allele mismatch at %s:%d .. 
REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); if ( args->check_ref & CHECK_REF_WARN ) - fprintf(pysamerr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); + fprintf(pysam_stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); free(ref); return ERR_REF_MISMATCH; } @@ -297,17 +298,19 @@ static int realign(args_t *args, bcf1_t *line) if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; } - // trim from right int ori_pos = line->pos; while (1) { // is the rightmost base identical in all alleles? + int min_len = als[0].l; for (i=1; in_allele; i++) { if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break; + if ( als[i].l < min_len ) min_len = als[i].l; } if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed + if ( min_len<=1 && line->pos==0 ) break; int pad_from_left = 0; for (i=0; in_allele; i++) // trim all alleles @@ -345,7 +348,7 @@ static int realign(args_t *args, bcf1_t *line) if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break; if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; } - if ( i!=line->n_allele || min_len==1 ) break; // there are differences, cannot be trimmed + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed ntrim_left++; } if ( ntrim_left ) @@ -855,7 +858,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf { \ /* expecting diploid gt in INFO */ \ if (nvals_ori!=lines[0]->n_allele*(lines[0]->n_allele+1)/2) { \ - fprintf(pysamerr, "todo: merge Number=G INFO fields for haploid sites\n"); \ + fprintf(pysam_stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, 
lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ } \ int nvals = dst->n_allele*(dst->n_allele+1)/2; \ @@ -1289,7 +1292,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm { kstring_t *tmp = &args->tmp_str[i]; kputsn(tmp->s,tmp->l,&str); - for (j=tmp->l; jl; jntmp_arr2 = str.m; args->tmp_arr2 = (uint8_t*)str.s; @@ -1560,7 +1563,7 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) else if ( args->check_ref==CHECK_REF_EXIT ) error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1); else if ( args->check_ref & CHECK_REF_WARN ) - fprintf(pysamerr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); + fprintf(pysam_stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); } } } @@ -1583,7 +1586,7 @@ static void normalize_vcf(args_t *args) htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out, args->n_threads); - bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); bcf_hdr_write(out, args->hdr); int prev_rid = -1, prev_pos = -1, prev_type = 0; @@ -1643,42 +1646,42 @@ static void normalize_vcf(args_t *args) if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break; j++; } - if ( args->rbuf.n==args->rbuf.m ) j = 1; if ( j>0 ) flush_buffer(args, out, j); } flush_buffer(args, out, args->rbuf.n); hts_close(out); - fprintf(pysamerr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); + fprintf(pysam_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); if ( args->check_ref & CHECK_REF_FIX ) - 
fprintf(pysamerr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); + fprintf(pysam_stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); } static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n"); - fprintf(pysamerr, " split multiallelic sites into multiple rows; recover multiallelics from\n"); - fprintf(pysamerr, " multiple rows.\n"); - fprintf(pysamerr, "Usage: bcftools norm [options] \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); - fprintf(pysamerr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); - fprintf(pysamerr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); - fprintf(pysamerr, " -f, --fasta-ref reference sequence\n"); - fprintf(pysamerr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); - fprintf(pysamerr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, " -w, --site-win buffer for sorting lines which 
changed position during realignment [1000]\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n"); + fprintf(pysam_stderr, " split multiallelic sites into multiple rows; recover multiallelics from\n"); + fprintf(pysam_stderr, " multiple rows.\n"); + fprintf(pysam_stderr, "Usage: bcftools norm [options] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); + fprintf(pysam_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); + fprintf(pysam_stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); + fprintf(pysam_stderr, " -f, --fasta-ref reference sequence\n"); + fprintf(pysam_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than 
index-jumps\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -1691,6 +1694,7 @@ int main_vcfnorm(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->aln_win = 100; args->buf_win = 1000; args->mrows_collapse = COLLAPSE_BOTH; @@ -1716,6 +1720,7 @@ int main_vcfnorm(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"check-ref",required_argument,NULL,'c'}, {"strict-filter",no_argument,NULL,'s'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; @@ -1759,7 +1764,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(pysamerr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); + fprintf(pysam_stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); args->rmdup = COLLAPSE_NONE<<1; break; case 's': args->strict_filter = 1; break; @@ -1773,6 +1778,7 @@ int main_vcfnorm(int argc, char *argv[]) if ( *tmp ) error("Could not parse argument: --site-win %s\n", optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index e2ca04a..87a773f 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -140,7 +140,7 @@ typedef struct _args_t char **plugin_paths; char **argv, *output_fname, *regions_list, *targets_list; - int argc, drop_header, verbose; + int argc, drop_header, verbose, record_cmd_line; } args_t; @@ -239,13 +239,6 @@ static void print_plugin_usage_hint(void) fprintf(stderr, " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n" "- Is the plugin path correct?\n\n" - "- 
Are all shared libraries, namely libhts.so, accessible? Verify with\n" - " on Mac OS X: `otool -L your/plugin.so` and set DYLD_LIBRARY_PATH if they are not\n" - " on Linux: `ldd your/plugin.so` and set LD_LIBRARY_PATH if they are not\n" - "\n" - "- If not installed systemwide, set the environment variable LD_LIBRARY_PATH (linux) or\n" - "DYLD_LIBRARY_PATH (mac) to include directory where *libhts.so* is located.\n" - "\n" "- Run \"bcftools plugin -lv\" for more detailed error output.\n" "\n", getenv("BCFTOOLS_PLUGINS") @@ -418,7 +411,7 @@ static void init_data(args_t *args) if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); - bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); if ( !args->drop_header ) { args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); @@ -460,6 +453,7 @@ static void usage(args_t *args) fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); fprintf(stderr, "VCF output options:\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); fprintf(stderr, " --threads number of extra output compression threads [0]\n"); @@ -480,12 +474,27 @@ int main_plugin(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->nplugin_paths = -1; int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0; if ( argc==1 ) usage(args); + char *plugin_name = NULL; - if ( argv[1][0]!='-' ) { plugin_name = 
argv[1]; argc--; argv++; } + if ( argv[1][0]!='-' ) + { + plugin_name = argv[1]; + argc--; + argv++; + load_plugin(args, plugin_name, 1, &args->plugin); + if ( args->plugin.run ) + { + int ret = args->plugin.run(argc, argv); + destroy_data(args); + free(args); + return ret; + } + } static struct option loptions[] = { @@ -502,6 +511,7 @@ int main_plugin(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0) @@ -527,6 +537,7 @@ int main_plugin(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': plist_only = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case '?': case 'h': usage_only = 1; break; default: error("Unknown argument: %s\n", optarg); @@ -535,7 +546,6 @@ int main_plugin(int argc, char *argv[]) if ( plist_only ) return list_plugins(args); if ( usage_only && ! 
plugin_name ) usage(args); - load_plugin(args, plugin_name, 1, &args->plugin); if ( version_only ) { const char *bver, *hver; @@ -554,15 +564,6 @@ int main_plugin(int argc, char *argv[]) return 0; } - if ( args->plugin.run ) - { - int iopt = optind; optind = 0; - int ret = args->plugin.run(argc-iopt, argv+iopt); - destroy_data(args); - free(args); - return ret; - } - char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index 5c29993..8365f7e 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -142,7 +142,7 @@ typedef struct _args_t char **plugin_paths; char **argv, *output_fname, *regions_list, *targets_list; - int argc, drop_header, verbose; + int argc, drop_header, verbose, record_cmd_line; } args_t; @@ -172,11 +172,11 @@ static void add_plugin_paths(args_t *args, const char *path) args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1)); args->plugin_paths[args->nplugin_paths] = dir; args->nplugin_paths++; - if ( args->verbose ) fprintf(pysamerr, "plugin directory %s .. ok\n", dir); + if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir); } else { - if ( args->verbose ) fprintf(pysamerr, "plugin directory %s .. %s\n", dir, strerror(errno)); + if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno)); free(dir); } @@ -214,8 +214,8 @@ static void *dlopen_plugin(args_t *args, const char *fname) handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though if ( args->verbose ) { - if ( !handle ) fprintf(pysamerr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); - else fprintf(pysamerr,"%s:\n\tdlopen .. ok\n", tmp); + if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); + else fprintf(pysam_stderr,"%s:\n\tdlopen .. 
ok\n", tmp); } free(tmp); if ( handle ) return handle; @@ -225,8 +225,8 @@ static void *dlopen_plugin(args_t *args, const char *fname) handle = dlopen(fname, RTLD_NOW); if ( args->verbose ) { - if ( !handle ) fprintf(pysamerr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); - else fprintf(pysamerr,"%s:\n\tdlopen .. ok\n", fname); + if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); + else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", fname); } return handle; @@ -234,20 +234,13 @@ static void *dlopen_plugin(args_t *args, const char *fname) static void print_plugin_usage_hint(void) { - fprintf(pysamerr, "\nNo functional bcftools plugins were found"); + fprintf(pysam_stderr, "\nNo functional bcftools plugins were found"); if ( !getenv("BCFTOOLS_PLUGINS") ) - fprintf(pysamerr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n"); + fprintf(pysam_stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n"); else - fprintf(pysamerr, + fprintf(pysam_stderr, " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n" "- Is the plugin path correct?\n\n" - "- Are all shared libraries, namely libhts.so, accessible? Verify with\n" - " on Mac OS X: `otool -L your/plugin.so` and set DYLD_LIBRARY_PATH if they are not\n" - " on Linux: `ldd your/plugin.so` and set LD_LIBRARY_PATH if they are not\n" - "\n" - "- If not installed systemwide, set the environment variable LD_LIBRARY_PATH (linux) or\n" - "DYLD_LIBRARY_PATH (mac) to include directory where *libhts.so* is located.\n" - "\n" "- Run \"bcftools plugin -lv\" for more detailed error output.\n" "\n", getenv("BCFTOOLS_PLUGINS") @@ -275,19 +268,19 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) plugin->init = NULL; else - if ( args->verbose ) fprintf(pysamerr,"\tinit .. ok\n"); + if ( args->verbose ) fprintf(pysam_stderr,"\tinit .. 
ok\n"); plugin->run = (dl_run_f) dlsym(plugin->handle, "run"); ret = dlerror(); if ( ret ) plugin->run = NULL; else - if ( args->verbose ) fprintf(pysamerr,"\trun .. ok\n"); + if ( args->verbose ) fprintf(pysam_stderr,"\trun .. ok\n"); if ( !plugin->init && !plugin->run ) { if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); - else if ( args->verbose ) fprintf(pysamerr,"\tinit/run .. not found\n"); + else if ( args->verbose ) fprintf(pysam_stderr,"\tinit/run .. not found\n"); return -1; } @@ -296,7 +289,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) { if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name); - else if ( args->verbose ) fprintf(pysamerr,"\tversion .. not found\n"); + else if ( args->verbose ) fprintf(pysam_stderr,"\tversion .. not found\n"); return -1; } @@ -344,12 +337,12 @@ static void init_plugin(args_t *args) args->plugin.version(&bver, &hver); if ( strcmp(bver,bcftools_version()) && !warned_bcftools ) { - fprintf(pysamerr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver); + fprintf(pysam_stderr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver); warned_bcftools = 1; } if ( strcmp(hver,hts_version()) && !warned_htslib ) { - fprintf(pysamerr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver); + fprintf(pysam_stderr,"WARNING: htslib version mismatch .. 
bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver); warned_htslib = 1; } args->drop_header += ret; @@ -401,8 +394,8 @@ static int list_plugins(args_t *args) qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name); for (i=0; ifilter_str ) args->filter = filter_init(args->hdr, args->filter_str); - bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); if ( !args->drop_header ) { args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); @@ -449,28 +442,29 @@ static void destroy_data(args_t *args) static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Run user defined plugin\n"); - fprintf(pysamerr, "Usage: bcftools plugin [OPTIONS] [-- PLUGIN_OPTIONS]\n"); - fprintf(pysamerr, " bcftools +name [OPTIONS] [-- PLUGIN_OPTIONS]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "VCF input options:\n"); - fprintf(pysamerr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(pysamerr, " -i, --include select sites for which the expression is true\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, "VCF output options:\n"); - fprintf(pysamerr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysamerr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "Plugin options:\n"); - fprintf(pysamerr, " -h, --help list plugin's 
options\n"); - fprintf(pysamerr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(pysamerr, " -v, --verbose print debugging information on plugin failure\n"); - fprintf(pysamerr, " -V, --version print version string and exit\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Run user defined plugin\n"); + fprintf(pysam_stderr, "Usage: bcftools plugin [OPTIONS] [-- PLUGIN_OPTIONS]\n"); + fprintf(pysam_stderr, " bcftools +name [OPTIONS] [-- PLUGIN_OPTIONS]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "VCF input options:\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true\n"); + fprintf(pysam_stderr, " -i, --include select sites for which the expression is true\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, "VCF output options:\n"); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(pysam_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, "Plugin options:\n"); + fprintf(pysam_stderr, " -h, --help list plugin's options\n"); + fprintf(pysam_stderr, " -l, --list-plugins list available plugins. 
See BCFTOOLS_PLUGINS environment variable and man page for details\n"); + fprintf(pysam_stderr, " -v, --verbose print debugging information on plugin failure\n"); + fprintf(pysam_stderr, " -V, --version print version string and exit\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -482,12 +476,27 @@ int main_plugin(int argc, char *argv[]) args->output_fname = "-"; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; args->nplugin_paths = -1; int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0; if ( argc==1 ) usage(args); + char *plugin_name = NULL; - if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; } + if ( argv[1][0]!='-' ) + { + plugin_name = argv[1]; + argc--; + argv++; + load_plugin(args, plugin_name, 1, &args->plugin); + if ( args->plugin.run ) + { + int ret = args->plugin.run(argc, argv); + destroy_data(args); + free(args); + return ret; + } + } static struct option loptions[] = { @@ -504,6 +513,7 @@ int main_plugin(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0) @@ -529,6 +539,7 @@ int main_plugin(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': plist_only = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case '?': case 'h': usage_only = 1; break; default: error("Unknown argument: %s\n", optarg); @@ -537,34 +548,24 @@ int main_plugin(int argc, char *argv[]) if ( plist_only ) return list_plugins(args); if ( usage_only && ! 
plugin_name ) usage(args); - load_plugin(args, plugin_name, 1, &args->plugin); if ( version_only ) { const char *bver, *hver; args->plugin.version(&bver, &hver); - printf("bcftools %s using htslib %s\n", bcftools_version(), hts_version()); - printf("plugin at %s using htslib %s\n\n", bver, hver); + fprintf(pysam_stdout, "bcftools %s using htslib %s\n", bcftools_version(), hts_version()); + fprintf(pysam_stdout, "plugin at %s using htslib %s\n\n", bver, hver); return 0; } if ( usage_only ) { if ( args->plugin.usage ) - fprintf(pysamerr,"%s",args->plugin.usage()); + fprintf(pysam_stderr,"%s",args->plugin.usage()); else - fprintf(pysamerr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name); + fprintf(pysam_stderr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name); return 0; } - if ( args->plugin.run ) - { - int iopt = optind; optind = 0; - int ret = args->plugin.run(argc-iopt, argv+iopt); - destroy_data(args); - free(args); - return ret; - } - char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index 1265b57..10f56f1 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -156,7 +156,7 @@ static void list_columns(args_t *args) int i; bcf_sr_t *reader = &args->files->readers[0]; for (i=0; iheader); i++) - printf("%s\n", reader->header->samples[i]); + fprintf(pysam_stdout, "%s\n", reader->header->samples[i]); } static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc) @@ -178,30 +178,30 @@ static int compare_header(bcf_hdr_t *hdr, char **a, int na, char **b, int nb) static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n"); - fprintf(pysamerr, "Usage: bcftools query [options] [ [...]]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -c, --collapse collapse lines with 
duplicate positions for , see man page [none]\n"); - fprintf(pysamerr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysamerr, " -f, --format see man page for details\n"); - fprintf(pysamerr, " -H, --print-header print header\n"); - fprintf(pysamerr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(pysamerr, " -l, --list-samples print the list of samples and exit\n"); - fprintf(pysamerr, " -o, --output-file output file name [stdout]\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --samples list of samples to include\n"); - fprintf(pysamerr, " -S, --samples-file file of samples to include\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, " -u, --allow-undef-tags print \".\" for undefined tags\n"); - fprintf(pysamerr, " -v, --vcf-list process multiple VCFs listed in the file\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Examples:\n"); - fprintf(pysamerr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n"); + fprintf(pysam_stderr, "Usage: bcftools query [options] [ [...]]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -c, --collapse collapse lines with duplicate positions for , see man page [none]\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -f, --format see man page for 
details\n"); + fprintf(pysam_stderr, " -H, --print-header print header\n"); + fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -l, --list-samples print the list of samples and exit\n"); + fprintf(pysam_stderr, " -o, --output-file output file name [pysam_stdout]\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --samples list of samples to include\n"); + fprintf(pysam_stderr, " -S, --samples-file file of samples to include\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n"); + fprintf(pysam_stderr, " -v, --vcf-list process multiple VCFs listed in the file\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Examples:\n"); + fprintf(pysam_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -300,7 +300,7 @@ int main_vcfquery(int argc, char *argv[]) } if ( !args->format_str ) usage(); - args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout; + args->out = args->fn_out ? 
fopen(args->fn_out, "w") : pysam_stdout; if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( !args->vcf_list ) diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c index fa64b79..9560559 100644 --- a/bcftools/vcfroh.c +++ b/bcftools/vcfroh.c @@ -368,14 +368,31 @@ static void flush_viterbi(args_t *args) } } - // update the transition matrix tprob + // update the transition matrix + int n = 1; for (i=0; i<2; i++) { - int n = 0; for (j=0; j<2; j++) n += MAT(tcounts,2,i,j); - if ( !n) error("fixme: state %d not observed\n", i+1); - for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n; } + for (i=0; i<2; i++) + { + for (j=0; j<2; j++) + { + // no transition to i-th state was observed, set to a small number + if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n; + else MAT(tcounts,2,i,j) /= n; + } + } + + // normalize + for (i=0; i<2; i++) + { + double norm = 0; + for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i); + assert( norm!=0 ); + for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm; + } + if ( args->genmap_fname || args->rec_rate > 0 ) hmm_set_tprob(args->hmm, tcounts, 0); else @@ -385,14 +402,16 @@ static void flush_viterbi(args_t *args) deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev); delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev); niter++; - - fprintf(stderr,"%d: %f %f\n", niter,deltaz,delthw); + fprintf(stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", + niter,deltaz,delthw, + MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), + MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); } while ( deltaz > 0.0 || delthw > 0.0 ); - fprintf(stderr, "Viterbi training converged in %d iterations to", niter); double *tprob_arr = hmm_get_tprob(args->hmm); - for (i=0; i<2; i++) for (j=0; j<2; j++) fprintf(stderr, " %f", MAT(tprob_arr,2,i,j)); - fprintf(stderr, "\n"); + fprintf(stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", 
niter, + MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), + MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); // output the results for (i=0; inrids; i++) @@ -400,12 +419,16 @@ static void flush_viterbi(args_t *args) int ioff = args->rid_offs[i]; int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff; hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); + hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); uint8_t *vpath = hmm_get_viterbi_path(args->hmm); + double *fwd = hmm_get_fwd_bwd_prob(args->hmm); const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]); for (j=0; jsites[ioff+j]+1,vpath[j*2]==STATE_AZ ? 1 : 0); + int state = vpath[j*2]; + double pval = fwd[j*2 + state]; + printf("%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval); } } } diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index 92a9a4f..66ddc17 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -167,12 +167,12 @@ static void init_data(args_t *args) args->hmm = hmm_init(2, tprob, 10000); // print header - printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version()); - printf("# The command line was:\tbcftools %s", args->argv[0]); + fprintf(pysam_stdout, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version()); + fprintf(pysam_stdout, "# The command line was:\tbcftools %s", args->argv[0]); for (i=1; iargc; i++) - printf(" %s",args->argv[i]); - printf("\n#\n"); - printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n"); + fprintf(pysam_stdout, " %s",args->argv[i]); + fprintf(pysam_stdout, "\n#\n"); + fprintf(pysam_stdout, "# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n"); } static void destroy_data(args_t *args) @@ -336,7 +336,7 @@ static void flush_viterbi(args_t *args) { int state = vpath[i*2]==STATE_AZ ? 
1 : 0; double *pval = fwd + i*2; - printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state])); + fprintf(pysam_stdout, "%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state])); } return; } @@ -370,14 +370,31 @@ static void flush_viterbi(args_t *args) } } - // update the transition matrix tprob + // update the transition matrix + int n = 1; for (i=0; i<2; i++) { - int n = 0; for (j=0; j<2; j++) n += MAT(tcounts,2,i,j); - if ( !n) error("fixme: state %d not observed\n", i+1); - for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n; } + for (i=0; i<2; i++) + { + for (j=0; j<2; j++) + { + // no transition to i-th state was observed, set to a small number + if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n; + else MAT(tcounts,2,i,j) /= n; + } + } + + // normalize + for (i=0; i<2; i++) + { + double norm = 0; + for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i); + assert( norm!=0 ); + for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm; + } + if ( args->genmap_fname || args->rec_rate > 0 ) hmm_set_tprob(args->hmm, tcounts, 0); else @@ -387,14 +404,16 @@ static void flush_viterbi(args_t *args) deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev); delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev); niter++; - - fprintf(pysamerr,"%d: %f %f\n", niter,deltaz,delthw); + fprintf(pysam_stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", + niter,deltaz,delthw, + MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), + MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); } while ( deltaz > 0.0 || delthw > 0.0 ); - fprintf(pysamerr, "Viterbi training converged in %d iterations to", niter); double *tprob_arr = hmm_get_tprob(args->hmm); - for (i=0; i<2; i++) for (j=0; j<2; j++) fprintf(pysamerr, " %f", MAT(tprob_arr,2,i,j)); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e 
P(HW|AZ)=%e\n", niter, + MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW), + MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ)); // output the results for (i=0; inrids; i++) @@ -402,12 +421,16 @@ static void flush_viterbi(args_t *args) int ioff = args->rid_offs[i]; int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff; hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); + hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff); uint8_t *vpath = hmm_get_viterbi_path(args->hmm); + double *fwd = hmm_get_fwd_bwd_prob(args->hmm); const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]); for (j=0; jsites[ioff+j]+1,vpath[j*2]==STATE_AZ ? 1 : 0); + int state = vpath[j*2]; + double pval = fwd[j*2 + state]; + fprintf(pysam_stdout, "%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval); } } } @@ -624,7 +647,7 @@ static void vcfroh(args_t *args, bcf1_t *line) if ( skip_rid ) { - fprintf(pysamerr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line)); + fprintf(pysam_stderr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line)); args->skip_rid = line->rid; return; } @@ -657,30 +680,30 @@ static void vcfroh(args_t *args, bcf1_t *line) static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: HMM model for detecting runs of autozygosity.\n"); - fprintf(pysamerr, "Usage: bcftools roh [options] \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "General Options:\n"); - fprintf(pysamerr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); - fprintf(pysamerr, " --AF-tag use TAG for allele frequency\n"); - fprintf(pysamerr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(pysamerr, " -e, --estimate-AF calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in \n"); - fprintf(pysamerr, " -G, --GTs-only use 
GTs, ignore PLs, use for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n"); - fprintf(pysamerr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); - fprintf(pysamerr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n"); - fprintf(pysamerr, " -M, --rec-rate constant recombination rate per bp\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --sample sample to analyze\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "HMM Options:\n"); - fprintf(pysamerr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); - fprintf(pysamerr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); - fprintf(pysamerr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: HMM model for detecting runs of autozygosity.\n"); + fprintf(pysam_stderr, "Usage: bcftools roh [options] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "General Options:\n"); + fprintf(pysam_stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); + fprintf(pysam_stderr, " --AF-tag use TAG for allele frequency\n"); + fprintf(pysam_stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(pysam_stderr, " -e, --estimate-AF calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in \n"); + fprintf(pysam_stderr, " -G, --GTs-only use GTs, 
ignore PLs, use for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n"); + fprintf(pysam_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); + fprintf(pysam_stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n"); + fprintf(pysam_stderr, " -M, --rec-rate constant recombination rate per bp\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --sample sample to analyze\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "HMM Options:\n"); + fprintf(pysam_stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); + fprintf(pysam_stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); + fprintf(pysam_stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -787,7 +810,7 @@ int main_vcfroh(int argc, char *argv[]) vcfroh(args, args->files->readers[0].buffer[0]); } vcfroh(args, NULL); - fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); + fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); destroy_data(args); free(args); return 0; diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c index 32e7213..58875f6 100644 --- a/bcftools/vcfsom.c.pysam.c +++ b/bcftools/vcfsom.c.pysam.c @@ -104,7 +104,7 @@ char *msprintf(const char *fmt, ...) 
/* * char *t, *p = str; * t = column_next(p, '\t'); - * if ( strlen("")==t-p && !strncmp(p,"",t-p) ) printf("found!\n"); + * if ( strlen("")==t-p && !strncmp(p,"",t-p) ) fprintf(pysam_stdout, "found!\n"); * * char *t; * t = column_next(str, '\t'); if ( !*t ) error("expected field\n", str); @@ -574,7 +574,7 @@ static void do_train(args_t *args) fprintf(fp,"%e\t%f\t%f\n", prev_score, (float)igood/ngood, (float)ibad/nbad); if ( !printed && (float)igood/ngood > 0.9 ) { - printf("%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score); + fprintf(pysam_stdout, "%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score); printed = 1; } @@ -582,7 +582,7 @@ static void do_train(args_t *args) else if ( igoodprefix,strerror(errno)); @@ -607,36 +607,36 @@ static void do_classify(args_t *args) case MERGE_MAX: score = get_max_score(args, -1); break; case MERGE_AVG: score = get_avg_score(args, -1); break; } - printf("%e\n", 1.0 - score/max_score); + fprintf(pysam_stdout, "%e\n", 1.0 - score/max_score); } annots_reader_close(args); } static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: SOM (Self-Organizing Map) filtering.\n"); - fprintf(pysamerr, "Usage: bcftools som --train [options] \n"); - fprintf(pysamerr, " bcftools som --classify [options]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Model training options:\n"); - fprintf(pysamerr, " -f, --nfold n-fold cross-validation (number of maps) [5]\n"); - fprintf(pysamerr, " -p, --prefix prefix of output files\n"); - fprintf(pysamerr, " -s, --size map size [20]\n"); - fprintf(pysamerr, " -t, --train \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Classifying options:\n"); - fprintf(pysamerr, " -c, --classify \n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Experimental training options (no reason to change):\n"); - fprintf(pysamerr, " -b, --bmu-threshold threshold for 
selection of best-matching unit [0.9]\n"); - fprintf(pysamerr, " -d, --som-dimension SOM dimension [2]\n"); - fprintf(pysamerr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n"); - fprintf(pysamerr, " -l, --learning-rate learning rate [1.0]\n"); - fprintf(pysamerr, " -m, --merge -f merge algorithm [avg]\n"); - fprintf(pysamerr, " -n, --ntrain-sites effective number of training sites [number of good sites]\n"); - fprintf(pysamerr, " -r, --random-seed random seed, 0 for time() [1]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: SOM (Self-Organizing Map) filtering.\n"); + fprintf(pysam_stderr, "Usage: bcftools som --train [options] \n"); + fprintf(pysam_stderr, " bcftools som --classify [options]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Model training options:\n"); + fprintf(pysam_stderr, " -f, --nfold n-fold cross-validation (number of maps) [5]\n"); + fprintf(pysam_stderr, " -p, --prefix prefix of output files\n"); + fprintf(pysam_stderr, " -s, --size map size [20]\n"); + fprintf(pysam_stderr, " -t, --train \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Classifying options:\n"); + fprintf(pysam_stderr, " -c, --classify \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Experimental training options (no reason to change):\n"); + fprintf(pysam_stderr, " -b, --bmu-threshold threshold for selection of best-matching unit [0.9]\n"); + fprintf(pysam_stderr, " -d, --som-dimension SOM dimension [2]\n"); + fprintf(pysam_stderr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n"); + fprintf(pysam_stderr, " -l, --learning-rate learning rate [1.0]\n"); + fprintf(pysam_stderr, " -m, --merge -f merge algorithm [avg]\n"); + fprintf(pysam_stderr, " -n, --ntrain-sites effective number of training sites [number of good sites]\n"); + fprintf(pysam_stderr, " -r, --random-seed random seed, 0 for time() [1]\n"); + fprintf(pysam_stderr, 
"\n"); exit(1); } @@ -692,7 +692,7 @@ int main_vcfsom(int argc, char *argv[]) case 'd': args->ndim = atoi(optarg); if ( args->ndim<2 ) error("Expected -d >=2, got %d\n", args->ndim); - if ( args->ndim>3 ) fprintf(pysamerr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim); + if ( args->ndim>3 ) fprintf(pysam_stderr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim); break; case 't': args->action = SOM_TRAIN; break; case 'c': args->action = SOM_CLASSIFY; break; diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index fcbc15b..5653760 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -195,17 +195,17 @@ static inline int idist_i2bin(idist_t *d, int i) static void _indel_ctx_print1(_idc1_t *idc) { int i; - fprintf(stdout, "%d\t", idc->cnt); + fprintf(pysam_stdout, "%d\t", idc->cnt); for (i=0; ilen; i++) - fputc(idc->seq[i], stdout); - fputc('\n', stdout); + fputc(idc->seq[i], pysam_stdout); + fputc('\n', pysam_stdout); } static void _indel_ctx_print(indel_ctx_t *ctx) { int i; for (i=0; indat; i++) _indel_ctx_print1(&ctx->dat[i]); - fputc('\n',stdout); + fputc('\n',pysam_stdout); } #endif static int _indel_ctx_lookup(indel_ctx_t *ctx, char *seq, int seq_len, int *hit) @@ -317,9 +317,9 @@ int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, i } #if IC_DBG - fprintf(stdout,"ref: %s\n", ref); - fprintf(stdout,"alt: %s\n", alt); - fprintf(stdout,"ctx: %s\n", fai_ref); + fprintf(pysam_stdout,"ref: %s\n", ref); + fprintf(pysam_stdout,"alt: %s\n", alt); + fprintf(pysam_stdout,"ctx: %s\n", fai_ref); _indel_ctx_print(ctx); #endif @@ -900,7 +900,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case 
BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(pysamerr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; + default: fprintf(pysam_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT } @@ -1010,7 +1010,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int { nmm++; bcf_sr_t *reader = &files->readers[0]; - printf("DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); + fprintf(pysam_stdout, "DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); } else { @@ -1019,7 +1019,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int } } float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0; - printf("PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); + fprintf(pysam_stdout, "PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); } } } @@ -1089,38 +1089,38 @@ static void do_vcf_stats(args_t *args) static void print_header(args_t *args) { int i; - printf("# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version()); - printf("# The command line was:\tbcftools %s ", args->argv[0]); + fprintf(pysam_stdout, "# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version()); + fprintf(pysam_stdout, "# The command line was:\tbcftools %s ", args->argv[0]); for (i=1; iargc; i++) - printf(" %s",args->argv[i]); - printf("\n#\n"); + fprintf(pysam_stdout, " %s",args->argv[i]); + fprintf(pysam_stdout, "\n#\n"); - printf("# Definition of sets:\n# ID\t[2]id\t[3]tab-separated 
file names\n"); + fprintf(pysam_stdout, "# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n"); if ( args->files->nreaders==1 ) { const char *fname = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : ""; if ( args->split_by_id ) { - printf("ID\t0\t%s:known (sites with ID different from \".\")\n", fname); - printf("ID\t1\t%s:novel (sites where ID column is \".\")\n", fname); + fprintf(pysam_stdout, "ID\t0\t%s:known (sites with ID different from \".\")\n", fname); + fprintf(pysam_stdout, "ID\t1\t%s:novel (sites where ID column is \".\")\n", fname); } else - printf("ID\t0\t%s\n", fname); + fprintf(pysam_stdout, "ID\t0\t%s\n", fname); } else { const char *fname0 = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : ""; const char *fname1 = strcmp("-",args->files->readers[1].fname) ? args->files->readers[1].fname : ""; - printf("ID\t0\t%s\n", fname0); - printf("ID\t1\t%s\n", fname1); - printf("ID\t2\t%s\t%s\n", fname0,fname1); + fprintf(pysam_stdout, "ID\t0\t%s\n", fname0); + fprintf(pysam_stdout, "ID\t1\t%s\n", fname1); + fprintf(pysam_stdout, "ID\t2\t%s\t%s\n", fname0,fname1); if ( args->verbose_sites ) { - printf( + fprintf(pysam_stdout, "# Verbose per-site discordance output.\n" "# PSD\t[2]CHROM\t[3]POS\t[4]Number of matches\t[5]Number of mismatches\t[6]NRD\n"); - printf( + fprintf(pysam_stdout, "# Verbose per-site and per-sample output. 
Genotype codes: %d:HomRefRef, %d:HomAltAlt, %d:HetAltRef, %d:HetAltAlt, %d:haploidRef, %d:haploidAlt\n" "# DBG\t[2]CHROM\t[3]POS\t[4]Sample\t[5]GT in %s\t[6]GT in %s\n", GT_HOM_RR, GT_HOM_AA, GT_HET_RA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A, fname0,fname1); @@ -1132,42 +1132,42 @@ static void print_header(args_t *args) static void print_stats(args_t *args) { int i, id; - printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n"); + fprintf(pysam_stdout, "# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n"); for (id=0; idfiles->nreaders; id++) - printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); + fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); - printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); - printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); - printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); - printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); - printf("SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); - printf("SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); - printf("SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); + fprintf(pysam_stdout, "SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); + fprintf(pysam_stdout, "SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); + fprintf(pysam_stdout, "SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); + fprintf(pysam_stdout, "SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); + fprintf(pysam_stdout, "SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); + fprintf(pysam_stdout, "SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); + fprintf(pysam_stdout, "SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); + fprintf(pysam_stdout, "SN\t%d\tnumber 
of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); } - printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); + fprintf(pysam_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; int ts=0,tv=0; for (i=0; im_af; i++) { ts += stats->af_ts[i]; tv += stats->af_tv[i]; } - printf("TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0); + fprintf(pysam_stdout, "TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0); } if ( args->exons_fname ) { - printf("# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n"); + fprintf(pysam_stdout, "# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n"); for (id=0; idnstats; id++) { int in=args->stats[id].in_frame, out=args->stats[id].out_frame, na=args->stats[id].na_frame; int in1=args->stats[id].in_frame_alt1, out1=args->stats[id].out_frame_alt1, na1=args->stats[id].na_frame_alt1; - printf("FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0); + fprintf(pysam_stdout, "FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0); } } if ( args->indel_ctx ) { - printf("# ICS, Indel context summary:\n# 
ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n"); + fprintf(pysam_stdout, "# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { int nc = 0, ni = 0, na = args->stats[id].n_repeat_na; @@ -1176,25 +1176,25 @@ static void print_stats(args_t *args) nc += args->stats[id].n_repeat[i][0] + args->stats[id].n_repeat[i][2]; ni += args->stats[id].n_repeat[i][1] + args->stats[id].n_repeat[i][3]; } - printf("ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0); + fprintf(pysam_stdout, "ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0); } - printf("# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n"); + fprintf(pysam_stdout, "# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { for (i=1; istats[id].n_repeat[i][0]+args->stats[id].n_repeat[i][2], ni = args->stats[id].n_repeat[i][1]+args->stats[id].n_repeat[i][3]; - printf("ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1, + fprintf(pysam_stdout, "ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1, args->stats[id].n_repeat[i][0],args->stats[id].n_repeat[i][1],args->stats[id].n_repeat[i][2],args->stats[id].n_repeat[i][3], nc+ni ? 
(float)nc/(nc+ni) : 0.0); } } } - printf("# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); + fprintf(pysam_stdout, "# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - printf("SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0], + fprintf(pysam_stdout, "SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0], stats->af_repeats[0][0]+stats->af_repeats[1][0]+stats->af_repeats[2][0],stats->af_repeats[0][0],stats->af_repeats[1][0],stats->af_repeats[2][0]); // put the singletons stats into the first AF bin, note that not all of the stats is transferred (i.e. 
nrd mismatches) stats->af_snps[1] += stats->af_snps[0]; @@ -1204,32 +1204,32 @@ static void print_stats(args_t *args) stats->af_repeats[1][1] += stats->af_repeats[1][0]; stats->af_repeats[2][1] += stats->af_repeats[2][0]; } - printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); + fprintf(pysam_stdout, "# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=1; im_af; i++) // note that af[1] now contains also af[0], see SiS stats output above { if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue; - printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], + fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]); } } #if QUAL_STATS - printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); + fprintf(pysam_stdout, "# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; im_qual; i++) { if ( 
stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue; - printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]); + fprintf(pysam_stdout, "QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]); } } #endif for (i=0; inusr; i++) { - printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", + fprintf(pysam_stdout, "# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag); for (id=0; idnstats; id++) { @@ -1240,32 +1240,32 @@ static void print_stats(args_t *args) if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); const char *fmt = usr->type==BCF_HT_REAL ? 
"USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n"; - printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); + fprintf(pysam_stdout, fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } } - printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); + fprintf(pysam_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=stats->m_indel-1; i>=0; i--) - if ( stats->deletions[i] ) printf("IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); + if ( stats->deletions[i] ) fprintf(pysam_stdout, "IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); for (i=0; im_indel; i++) - if ( stats->insertions[i] ) printf("IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); + if ( stats->insertions[i] ) fprintf(pysam_stdout, "IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); } - printf("# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); + fprintf(pysam_stdout, "# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); for (id=0; idnstats; id++) { int t; for (t=0; t<15; t++) { if ( t>>2 == (t&3) ) continue; - printf("ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]); + fprintf(pysam_stdout, "ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]); } } if ( args->files->nreaders>1 && args->files->n_smpl ) { - printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl); + fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl); int x; for (x=0; x<2; x++) @@ -1273,12 +1273,12 @@ static void print_stats(args_t *args) gtcmp_t *stats; if ( x==0 ) { - printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA 
Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); stats = args->af_gts_snps; } else { - printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); stats = args->af_gts_indels; } uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0}; @@ -1292,28 +1292,28 @@ static void print_stats(args_t *args) nrd_mm[j] += stats[i].mm[j]; } if ( !i || !n ) continue; // skip singleton stats and empty bins - printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1)); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); - printf("\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n); + fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 
's' : 'i', 100.*(i-1)/(args->m_af-1)); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); + fprintf(pysam_stdout, "\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n); } if ( x==0 ) { - printf("# NRD and discordance is calculated as follows:\n"); - printf("# m .. number of matches\n"); - printf("# x .. number of mismatches\n"); - printf("# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); - printf("# RR discordance = xRR / (xRR + mRR)\n"); - printf("# RA discordance = xRA / (xRA + mRA)\n"); - printf("# AA discordance = xAA / (xAA + mAA)\n"); - printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); + fprintf(pysam_stdout, "# NRD and discordance is calculated as follows:\n"); + fprintf(pysam_stdout, "# m .. number of matches\n"); + fprintf(pysam_stdout, "# x .. 
number of mismatches\n"); + fprintf(pysam_stdout, "# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); + fprintf(pysam_stdout, "# RR discordance = xRR / (xRR + mRR)\n"); + fprintf(pysam_stdout, "# RA discordance = xRA / (xRA + mRA)\n"); + fprintf(pysam_stdout, "# AA discordance = xAA / (xAA + mAA)\n"); + fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); } else - printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); + fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)]; uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)]; - printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', + fprintf(pysam_stdout, "NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', m+mm ? mm*100.0/(m+mm) : 0, nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0, nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)] ? 
nrd_mm[T2S(GT_HET_RA)]*100.0/(nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)]) : 0, @@ -1327,13 +1327,13 @@ static void print_stats(args_t *args) smpl_r_t *smpl_r_array; if ( x==0 ) { - printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); + fprintf(pysam_stdout, "# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_snps; smpl_r_array = args->smpl_r_snps; } else { - printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); + fprintf(pysam_stdout, "# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_indels; smpl_r_array = args->smpl_r_indels; } @@ -1350,16 +1350,16 @@ static void print_stats(args_t *args) double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n; r = (sum_crossprod)/sqrt(x2_xx*y2_yy); } - printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? 
mm*100.0/(m+mm) : 0); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); - if (smpl_r->n && !isnan(r)) printf("\t%f\n", r*r); - else printf("\t"NA_STRING"\n"); + fprintf(pysam_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]); + fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]); + if (smpl_r->n && !isnan(r)) fprintf(pysam_stdout, "\t%f\n", r*r); + else fprintf(pysam_stdout, "\t"NA_STRING"\n"); } } } - printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); + fprintf(pysam_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1368,32 +1368,32 @@ static void print_stats(args_t *args) for (i=0; idp.m_vals; i++) { if ( stats->dp.vals[i]==0 && stats->dp_sites.vals[i]==0 ) continue; - printf("DP\t%d\t", id); - if ( i==0 ) printf("<%d", stats->dp.min); - else if ( i+1==stats->dp.m_vals ) printf(">%d", stats->dp.max); - else printf("%d", idist_i2bin(&stats->dp,i)); - printf("\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); - printf("\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? 
stats->dp_sites.vals[i]*100./sum_sites : 0); + fprintf(pysam_stdout, "DP\t%d\t", id); + if ( i==0 ) fprintf(pysam_stdout, "<%d", stats->dp.min); + else if ( i+1==stats->dp.m_vals ) fprintf(pysam_stdout, ">%d", stats->dp.max); + else fprintf(pysam_stdout, "%d", idist_i2bin(&stats->dp,i)); + fprintf(pysam_stdout, "\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); + fprintf(pysam_stdout, "\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0); } } if ( args->files->n_smpl ) { - printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n"); + fprintf(pysam_stdout, "# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { float dp = stats->smpl_ndp[i] ? 
stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0; - printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i], + fprintf(pysam_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i], stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i], stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]); } } - printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); + fprintf(pysam_stdout, "# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1408,12 +1408,12 @@ static void print_stats(args_t *args) } int nhom = stats->smpl_indel_homs[i]; int nhet = stats->smpl_indel_hets[i]; - printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); + fprintf(pysam_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); } } #ifdef HWE_STATS - printf("# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n"); + fprintf(pysam_stdout, "# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1426,28 +1426,28 @@ static void print_stats(args_t *args) if ( !sum_tot ) continue; int nprn = 3; - printf("HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot); + fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot); for (j=0; jnaf_hwe; j++) { sum_tmp += ptr[j]; float frac = (float)sum_tmp/sum_tot; if ( frac >= 0.75 ) { - while (nprn>0) { printf("\t%f", (float)j/args->naf_hwe); nprn--; } + 
while (nprn>0) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } break; } if ( frac >= 0.5 ) { - while (nprn>1) { printf("\t%f", (float)j/args->naf_hwe); nprn--; } + while (nprn>1) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } continue; } if ( frac >= 0.25 ) { - while (nprn>2) { printf("\t%f", (float)j/args->naf_hwe); nprn--; } + while (nprn>2) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } } } assert(nprn==0); - printf("\n"); + fprintf(pysam_stdout, "\n"); } } #endif @@ -1456,32 +1456,32 @@ static void print_stats(args_t *args) static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n"); - fprintf(pysamerr, " When two files are given, the program generates separate stats for intersection\n"); - fprintf(pysamerr, " and the complements. By default only sites are compared, -s/-S must given to include\n"); - fprintf(pysamerr, " also sample columns.\n"); - fprintf(pysamerr, "Usage: bcftools stats [options] []\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); - fprintf(pysamerr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(pysamerr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); - fprintf(pysamerr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysamerr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); - fprintf(pysamerr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(pysamerr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); - fprintf(pysamerr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(pysamerr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); - fprintf(pysamerr, " -S, --samples-file file of samples to include\n"); - fprintf(pysamerr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysamerr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysamerr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); - fprintf(pysamerr, " -v, --verbose produce verbose per-site and per-sample output\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n"); + fprintf(pysam_stderr, " When two files are given, the program generates separate stats for intersection\n"); + fprintf(pysam_stderr, " and the complements. 
By default only sites are compared, -s/-S must given to include\n"); + fprintf(pysam_stderr, " also sample columns.\n"); + fprintf(pysam_stderr, "Usage: bcftools stats [options] []\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); + fprintf(pysam_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); + fprintf(pysam_stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); + fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(pysam_stderr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); + fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(pysam_stderr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); + fprintf(pysam_stderr, " -S, --samples-file file of samples to include\n"); + fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(pysam_stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); + fprintf(pysam_stderr, " -v, --verbose produce 
verbose per-site and per-sample output\n"); + fprintf(pysam_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index ed41595..c14075d 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -72,6 +72,7 @@ typedef struct _args_t int sample_is_file, force_samples; char *include_types, *exclude_types; int include, exclude; + int record_cmd_line; htsFile *out; } args_t; @@ -86,7 +87,8 @@ static void init_data(args_t *args) bcf_hdr_append(args->hdr,"##INFO="); bcf_hdr_append(args->hdr,"##INFO="); } - bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); + else bcf_hdr_sync(args->hdr); // setup sample data if (args->sample_names) @@ -485,6 +487,7 @@ static void usage(args_t *args) fprintf(stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n"); fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output-file output file name [stdout]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); @@ -529,6 +532,7 @@ int main_vcfview(int argc, char *argv[]) args->update_info = 1; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -569,6 +573,7 @@ int main_vcfview(int argc, char *argv[]) {"max-af",required_argument,NULL,'Q'}, {"phased",no_argument,NULL,'p'}, {"exclude-phased",no_argument,NULL,'P'}, + 
{"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; @@ -678,6 +683,7 @@ int main_vcfview(int argc, char *argv[]) break; } case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case '?': usage(args); default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index a6a0cc0..53b7c53 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -74,6 +74,7 @@ typedef struct _args_t int sample_is_file, force_samples; char *include_types, *exclude_types; int include, exclude; + int record_cmd_line; htsFile *out; } args_t; @@ -88,7 +89,8 @@ static void init_data(args_t *args) bcf_hdr_append(args->hdr,"##INFO="); bcf_hdr_append(args->hdr,"##INFO="); } - bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); + else bcf_hdr_sync(args->hdr); // setup sample data if (args->sample_names) @@ -112,7 +114,7 @@ static void init_data(args_t *args) for (i=0; iforce_samples) { - fprintf(pysamerr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); + fprintf(pysam_stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); } else { error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } @@ -133,7 +135,7 @@ static void init_data(args_t *args) for (i=0; iforce_samples) { - fprintf(pysamerr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); + fprintf(pysam_stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); continue; } else { error("Error: subset called for sample that does not exist in header: \"%s\". 
Use \"--force-samples\" to ignore this error.\n", smpl[i]); @@ -147,7 +149,7 @@ static void init_data(args_t *args) free(smpl); khash_str2int_destroy(hdr_samples); if (args->n_samples == 0) { - fprintf(pysamerr, "Warn: subsetting has removed all samples\n"); + fprintf(pysam_stderr, "Warn: subsetting has removed all samples\n"); args->sites_only = 1; } } @@ -158,7 +160,7 @@ static void init_data(args_t *args) // determine variant types to include/exclude if (args->include_types || args->exclude_types) { if (args->include_types && args->exclude_types) { - fprintf(pysamerr, "Error: only supply one of --include-types, --exclude-types options\n"); + fprintf(pysam_stderr, "Error: only supply one of --include-types, --exclude-types options\n"); exit(1); } char **type_list = 0; @@ -186,8 +188,8 @@ static void init_data(args_t *args) else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER; else { - fprintf(pysamerr, "[E::%s] unknown type\n", type_list[i]); - fprintf(pysamerr, "Accepted types are snps, indels, mnps, other\n"); + fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]); + fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } @@ -200,8 +202,8 @@ static void init_data(args_t *args) else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER; else { - fprintf(pysamerr, "[E::%s] unknown type\n", type_list[i]); - fprintf(pysamerr, "Accepted types are snps, indels, mnps, other\n"); + fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]); + fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } @@ -290,7 +292,7 @@ int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: 
BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: fprintf(pysamerr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; + default: fprintf(pysam_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT if (!sample_phased) { @@ -479,44 +481,45 @@ void set_allele_type (int *atype, char *atype_string) static void usage(args_t *args) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n"); - fprintf(pysamerr, "Usage: bcftools view [options] [region1 [...]]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Output options:\n"); - fprintf(pysamerr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n"); - fprintf(pysamerr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); - fprintf(pysamerr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); - fprintf(pysamerr, " -o, --output-file output file name [stdout]\n"); - fprintf(pysamerr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysamerr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysamerr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysamerr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(pysamerr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. 
Exclude regions with \"^\" prefix\n"); - fprintf(pysamerr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Subset options:\n"); - fprintf(pysamerr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); - fprintf(pysamerr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(pysamerr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(pysamerr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(pysamerr, " --force-samples only warn about unknown subset samples\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Filter options:\n"); - fprintf(pysamerr, " -c/C, --min-ac/--max-ac [:] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(pysamerr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(pysamerr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); - fprintf(pysamerr, " -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n"); - fprintf(pysamerr, " -i/e, --include/--exclude select/exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysamerr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n"); - fprintf(pysamerr, " -m/M, --min-alleles/--max-alleles minimum/maximum number of alleles listed in REF and ALT (e.g. 
-m2 -M2 for biallelic sites)\n"); - fprintf(pysamerr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n"); - fprintf(pysamerr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(pysamerr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(pysamerr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); - fprintf(pysamerr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n"); - fprintf(pysamerr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n"); + fprintf(pysam_stderr, "Usage: bcftools view [options] [region1 [...]]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Output options:\n"); + fprintf(pysam_stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n"); + fprintf(pysam_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); + fprintf(pysam_stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); + fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(pysam_stderr, " -o, --output-file output file name [pysam_stdout]\n"); + fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); 
+ fprintf(pysam_stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); + fprintf(pysam_stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); + fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Subset options:\n"); + fprintf(pysam_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); + fprintf(pysam_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); + fprintf(pysam_stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(pysam_stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(pysam_stderr, " --force-samples only warn about unknown subset samples\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Filter options:\n"); + fprintf(pysam_stderr, " -c/C, --min-ac/--max-ac [:] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(pysam_stderr, " -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n"); + fprintf(pysam_stderr, " -i/e, --include/--exclude select/exclude sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n"); + fprintf(pysam_stderr, " -m/M, --min-alleles/--max-alleles minimum/maximum number of alleles listed in REF and ALT (e.g. 
-m2 -M2 for biallelic sites)\n"); + fprintf(pysam_stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n"); + fprintf(pysam_stderr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + fprintf(pysam_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); + fprintf(pysam_stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n"); + fprintf(pysam_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(pysam_stderr, "\n"); exit(1); } @@ -531,6 +534,7 @@ int main_vcfview(int argc, char *argv[]) args->update_info = 1; args->output_type = FT_VCF; args->n_threads = 0; + args->record_cmd_line = 1; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -571,6 +575,7 @@ int main_vcfview(int argc, char *argv[]) {"max-af",required_argument,NULL,'Q'}, {"phased",no_argument,NULL,'p'}, {"exclude-phased",no_argument,NULL,'P'}, + {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; char *tmp; @@ -680,6 +685,7 @@ int main_vcfview(int argc, char *argv[]) break; } case 9 : args->n_threads = strtol(optarg, 0, 0); break; + case 8 : args->record_cmd_line = 0; break; case '?': usage(args); default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c index 1fd0d4e..af54532 100644 --- a/bcftools/version.c.pysam.c +++ b/bcftools/version.c.pysam.c @@ -41,7 +41,7 @@ void error(const char *format, ...) 
{ va_list ap; va_start(ap, format); - vfprintf(pysamerr, format, ap); + vfprintf(pysam_stderr, format, ap); va_end(ap); exit(-1); } diff --git a/bcftools/version.h b/bcftools/version.h index 70d4f93..05929f5 100644 --- a/bcftools/version.h +++ b/bcftools/version.h @@ -1 +1 @@ -#define BCFTOOLS_VERSION "1.3" +#define BCFTOOLS_VERSION "1.3.1" diff --git a/doc/faq.rst b/doc/faq.rst index 1f45981..d5d84c4 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -10,7 +10,7 @@ use the github URL: https://github.com/pysam-developers/pysam. As pysam is a wrapper around htslib and the samtools package, I suggest cite `Li et al (2009) `. -Is pysam thread-save? +Is pysam thread-safe? ===================== Pysam is a mix of python and C code. Instructions within python are diff --git a/doc/glossary.rst b/doc/glossary.rst index f40bcfb..e35a537 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -81,7 +81,8 @@ Glossary In alignments with soft clipping part of the query sequence are not aligned. The unaligned query sequence is still part - of the alignment record. This is in difference to hard clipped reads. + of the alignment record. This is in difference to + :term:`hard clipped` reads. hard clipping hard clipped diff --git a/doc/installation.rst b/doc/installation.rst index a3fa2a2..2dbf2a4 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -58,8 +58,3 @@ python 2.7 contains pre-built C-files and cython needs not be present during installation. However, when installing the source tarball on python 3 or building from the repository, these pre-built C-files are not present and cython needs to be installed beforehand. 
- - - - - diff --git a/doc/release.rst b/doc/release.rst index 802c6e5..f49b8f0 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,26 @@ Release notes ============= +Release 0.9.1 +============= + +This is a bugfix release addressing some installation problems +in pysam 0.9.0, in particular: + +* patch included htslib to work with older libcurl versions, fixes #262. +* do not require cython for python 3 install, fixes #260 +* FastaFile does not accept filepath_index any more, see #270 +* add AlignedSegment.get_cigar_stats method. +* py3 bugfix in VariantFile.subset_samples, fixes #272 +* add missing sysconfig import, fixes #278 +* do not redirect stdout, but instead write to a separately + created file. This should resolve issues when pysam is used + in notebooks or other environments that redirect stdout. +* wrap htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1 +* use bgzf throughout instead of gzip +* allow specifying a fasta reference for CRAM file when opening + for both read and write, fixes #280 + Release 0.9.0 ============= diff --git a/import.py b/import.py index 4018698..12d2016 100644 --- a/import.py +++ b/import.py @@ -12,16 +12,49 @@ # For samtools, type: # rm -rf samtools # python import.py samtools download/samtools +# # Manually, then: # modify config.h to set compatibility flags -# change bamtk.c.pysam.c/main to bamtk.c.pysam.c/samtools_main # # For bcftools, type: # rm -rf bedtools # python import.py bedtools download/bedtools +# rm -rf bedtools/test bedtools/plugins + +import fnmatch import os +import re +import shutil import sys -import fnmatch +import hashlib + + +EXCLUDE = { + "samtools": ( + "razip.c", "bgzip.c", "main.c", + "calDepth.c", "bam2bed.c", "wgsim.c", + "md5fa.c", "md5sum-lite.c", "maq2sam.c", + "bamcheck.c", "chk_indel.c", "vcf-miniview.c", + "htslib-1.3", # do not import twice + "hfile_irods.c", # requires irods library + ), + "bcftools": ( + "test", "plugins", "peakfit.c", + "peakfit.h", + # needs to renamed, name 
conflict with samtools reheader + "reheader.c", + "polysomy.c"), + "htslib": ( + 'htslib/tabix.c', 'htslib/bgzip.c', + 'htslib/htsfile.c', 'htslib/hfile_irods.c'), +} + + +MAIN = { + "samtools": "bamtk", + "bcftools": "main" +} + def locate(pattern, root=os.curdir): @@ -35,20 +68,57 @@ def locate(pattern, root=os.curdir): def _update_pysam_files(cf, destdir): '''update pysam files applying redirection of ouput''' + basename = os.path.basename(destdir) for filename in cf: if not filename: continue dest = filename + ".pysam.c" with open(filename) as infile: + lines = "".join(infile.readlines()) with open(dest, "w") as outfile: outfile.write('#include "pysam.h"\n\n') - outfile.write( - re.sub("stderr", "pysamerr", "".join(infile.readlines()))) + subname, _ = os.path.splitext(os.path.basename(filename)) + if subname in MAIN.get(basename, []): + lines = re.sub("int main\(", "int {}_main(".format( + basename), lines) + else: + lines = re.sub("int main\(", "int {}_{}_main(".format( + basename, subname), lines) + lines = re.sub("stderr", "pysam_stderr", lines) + lines = re.sub("stdout", "pysam_stdout", lines) + lines = re.sub(" printf\(", " fprintf(pysam_stdout, ", lines) + lines = re.sub("([^kf])puts\(([^)]+)\)", + r"\1fputs(\2, pysam_stdout) & fputc('\\n', pysam_stdout)", + lines) + lines = re.sub("putchar\(([^)]+)\)", + r"fputc(\1, pysam_stdout)", lines) + + fn = os.path.basename(filename) + # some specific fixes: + SPECIFIC_SUBSTITUTIONS = { + "bam_md.c": ( + 'sam_open_format("-", mode_w', + 'sam_open_format(pysam_stdout_fn, mode_w'), + "phase.c": ( + 'putc("ACGT"[f->seq[j] == 1? (c&3, pysam_stdout) : (c>>16&3)]);', + 'putc("ACGT"[f->seq[j] == 1? 
(c&3) : (c>>16&3)], pysam_stdout);'), + "cut_target.c": ( + 'putc(33 + (cns[j]>>8>>2, pysam_stdout));', + 'putc(33 + (cns[j]>>8>>2), pysam_stdout);') + } + if fn in SPECIFIC_SUBSTITUTIONS: + lines = lines.replace( + SPECIFIC_SUBSTITUTIONS[fn][0], + SPECIFIC_SUBSTITUTIONS[fn][1]) + outfile.write(lines) + with open(os.path.join(destdir, "pysam.h"), "w")as outfile: outfile.write("""#ifndef PYSAM_H #define PYSAM_H #include "stdio.h" -extern FILE * pysamerr; +extern FILE * pysam_stderr; +extern FILE * pysam_stdout; +extern const char * pysam_stdout_fn; #endif """) @@ -57,7 +127,7 @@ if len(sys.argv) >= 1: if len(sys.argv) != 3: raise ValueError("import requires dest src") - dest, srcdir = sys.argv[2:4] + dest, srcdir = sys.argv[1:3] if dest not in EXCLUDE: raise ValueError("import expected one of %s" % ",".join(EXCLUDE.keys())) diff --git a/pysam/__init__.py b/pysam/__init__.py index cd32bf5..d1b5d41 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -1,5 +1,6 @@ import os import sys +import sysconfig from pysam.libchtslib import * from pysam.cutils import * @@ -23,6 +24,7 @@ import pysam.Pileup as Pileup from pysam.samtools import * import pysam.config + # export all the symbols from separate modules __all__ = \ libchtslib.__all__ +\ diff --git a/pysam/calignedsegment.pyx b/pysam/calignedsegment.pyx index 0a2b94f..f4e0750 100644 --- a/pysam/calignedsegment.pyx +++ b/pysam/calignedsegment.pyx @@ -63,6 +63,7 @@ from cpython cimport array as c_array from cpython.version cimport PY_MAJOR_VERSION from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize from libc.string cimport strchr +from cpython cimport array as c_array from pysam.cutils cimport force_bytes, force_str, \ charptr_to_str, charptr_to_bytes @@ -76,14 +77,15 @@ cdef char * parray_types = 'bBhHiIf' # translation tables # cigar code to character and vice versa -cdef char* CODE2CIGAR= "MIDNSHP=X" +cdef char* CODE2CIGAR= "MIDNSHP=XB" +cdef int NCIGAR_CODES = 10 if PY_MAJOR_VERSION >= 3: CIGAR2CODE = 
dict([y, x] for x, y in enumerate(CODE2CIGAR)) else: CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR)) -CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=X])") +CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") ##################################################################### # typecode guessing @@ -93,16 +95,16 @@ cdef inline char map_typecode_htslib_to_python(uint8_t s): # map type from htslib to python array cdef char * f = strchr(htslib_types, s) + if f == NULL: - raise ValueError("unknown htslib tag typecode '%s'" % chr(s)) + return 0 return parray_types[f - htslib_types] cdef inline uint8_t map_typecode_python_to_htslib(char s): """determine value type from type code of array""" cdef char * f = strchr(parray_types, s) if f == NULL: - raise ValueError( - "unknown conversion for array typecode '%s'" % s) + return 0 return htslib_types[f - parray_types] # optional tag data manipulation @@ -229,6 +231,8 @@ cdef inline packTags(tags): """ fmts, args = ["<"], [] + cdef char array_typecode + datatype2format = { b'c': ('b', 1), b'C': ('B', 1), @@ -273,9 +277,14 @@ cdef inline packTags(tags): elif isinstance(value, array.array): # binary tags from arrays if valuetype is None: - valuetype = force_bytes(chr( - map_typecode_python_to_htslib(ord(value.typecode)))) + array_typecode = map_typecode_python_to_htslib(ord(value.typecode)) + + if array_typecode == 0: + raise ValueError("unsupported type code '{}'" + .format(value.typecode)) + valuetype = force_bytes(chr(array_typecode)) + if valuetype not in datatype2format: raise ValueError("invalid value type '%s' (%s)" % (valuetype, type(valuetype))) @@ -501,6 +510,13 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): with the cigar string to reconstitute the query or the reference sequence. + Positions corresponding to `N` (skipped region from the reference) + in the CIGAR string will not appear in the returned sequence. The + MD should correspondingly not contain these. 
Thus proper tags are:: + + Deletion from the reference: cigar=5M1D5M MD=5^C5 + Skipped region from reference: cigar=5M1N5M MD=10 + Returns ------- @@ -542,10 +558,12 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): s[s_idx] = read_sequence[r_idx] r_idx += 1 s_idx += 1 - elif op == BAM_CDEL or op == BAM_CREF_SKIP: + elif op == BAM_CDEL: for i from 0 <= i < l: s[s_idx] = '-' s_idx += 1 + elif op == BAM_CREF_SKIP: + pass elif op == BAM_CINS: for i from 0 <= i < l: # encode insertions into reference as lowercase @@ -1409,10 +1427,12 @@ cdef class AlignedSegment: for i from 0 <= i < l: result.append(ref_seq[r_idx]) r_idx += 1 - elif op == BAM_CDEL or op == BAM_CREF_SKIP: + elif op == BAM_CDEL: for i from 0 <= i < l: result.append(ref_seq[r_idx]) r_idx += 1 + elif op == BAM_CREF_SKIP: + pass elif op == BAM_CINS: r_idx += l elif op == BAM_CSOFT_CLIP: @@ -1426,7 +1446,6 @@ cdef class AlignedSegment: return "".join(result) - def get_aligned_pairs(self, matches_only=False, with_seq=False): """a list of aligned read (query) and reference positions. @@ -1505,7 +1524,7 @@ cdef class AlignedSegment: else: qpos += l - elif op == BAM_CDEL or op == BAM_CREF_SKIP: + elif op == BAM_CDEL: if not _matches_only: if _with_seq: for i from pos <= i < pos + l: @@ -1519,6 +1538,17 @@ cdef class AlignedSegment: elif op == BAM_CHARD_CLIP: pass # advances neither + elif op == BAM_CREF_SKIP: + if not _matches_only: + if _with_seq: + for i from pos <= i < pos + l: + result.append((None, i, None)) + else: + for i from pos <= i < pos + l: + result.append((None, i)) + + pos += l + elif op == BAM_CPAD: raise NotImplementedError( "Padding (BAM_CPAD, 6) is currently not supported. " @@ -1597,6 +1627,81 @@ cdef class AlignedSegment: return overlap + def get_cigar_stats(self): + """summary of operations in cigar string. + + The output order in the array is "MIDNSHP=X" followed by a + field for the NM tag. If the NM tag is not present, this + field will always be 0. 
+ + +-----+--------------+-----+ + |M |BAM_CMATCH |0 | + +-----+--------------+-----+ + |I |BAM_CINS |1 | + +-----+--------------+-----+ + |D |BAM_CDEL |2 | + +-----+--------------+-----+ + |N |BAM_CREF_SKIP |3 | + +-----+--------------+-----+ + |S |BAM_CSOFT_CLIP|4 | + +-----+--------------+-----+ + |H |BAM_CHARD_CLIP|5 | + +-----+--------------+-----+ + |P |BAM_CPAD |6 | + +-----+--------------+-----+ + |= |BAM_CEQUAL |7 | + +-----+--------------+-----+ + |X |BAM_CDIFF |8 | + +-----+--------------+-----+ + |NM |NM tag |9 | + +-----+--------------+-----+ + + If no cigar string is present, empty arrays will be returned. + + Parameters + ---------- + + Returns + ------- + + arrays : two arrays. The first contains the nucleotide counts within + each cigar operation, the second contains the number of blocks for + each cigar operation. + + """ + + cdef int nfields = NCIGAR_CODES + 1 + + cdef c_array.array base_counts = array.array( + "I", + [0] * nfields) + cdef uint32_t [:] base_view = base_counts + cdef c_array.array block_counts = array.array( + "I", + [0] * nfields) + cdef uint32_t [:] block_view = block_counts + + cdef bam1_t * src = self._delegate + cdef int op + cdef uint32_t l + cdef int32_t k + cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) + + if cigar_p == NULL: + return None + + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + base_view[op] += l + block_view[op] += 1 + + cdef uint8_t * v = bam_aux_get(src, 'NM') + if v != NULL: + base_view[nfields - 1] = bam_aux2i(v) + + return base_counts, block_counts + ##################################################### ## Unsorted as yet # TODO: capture in CIGAR object diff --git a/pysam/calignmentfile.pxd b/pysam/calignmentfile.pxd index a7e956d..3384e7e 100644 --- a/pysam/calignmentfile.pxd +++ b/pysam/calignmentfile.pxd @@ -39,6 +39,7 @@ ctypedef struct __iterdata: cdef class AlignmentFile: cdef object _filename + cdef object 
_reference_filename # pointer to htsFile structure cdef htsFile * htsfile diff --git a/pysam/calignmentfile.pyx b/pysam/calignmentfile.pyx index f258a66..6473220 100644 --- a/pysam/calignmentfile.pyx +++ b/pysam/calignmentfile.pyx @@ -112,7 +112,8 @@ VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"), "UR", "SP"), "RG" : ("ID", "SM", "LB", "DS", "PU", "PI", "CN", "DT", - "PL", "FO", "KS", "PG"), + "PL", "FO", "KS", "PG", + "PM"), "PG" : ("PN", "ID", "VN", "CL", "PP"),} @@ -218,7 +219,7 @@ cdef class AlignmentFile: """AlignmentFile(filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=NULL, header=None, add_sq_text=False, check_header=True, check_sq=True, - filename=None) + reference_filename=None, filename=None) A :term:`SAM`/:term:`BAM` formatted file. @@ -248,8 +249,8 @@ cdef class AlignmentFile: 4. The names (`reference_names`) and lengths (`reference_lengths`) are supplied directly as lists. - For writing a CRAM file, the filename of the reference can be - added through a fasta formatted file (`reference_filename`) + When reading or writing a CRAM file, the filename of a FASTA-formatted + reference can be specified with `reference_filename`. By default, if a file is opened in mode 'r', it is checked for a valid header (`check_header` = True) and a definition of @@ -311,6 +312,12 @@ cdef class AlignmentFile: when reading, check if SQ entries are present in header (default=True) + reference_filename : string + Path to a FASTA-formatted reference file. Valid only for CRAM files. + When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL + specified in the header (``UR`` tag), which are normally used to find + the reference. + filename : string Alternative to filepath_or_object. Filename of the file to be opened. @@ -390,6 +397,7 @@ cdef class AlignmentFile: will be closed and a new file will be opened. 
''' cdef char *cfilename + cdef char *creference_filename cdef char *cindexname cdef char *cmode @@ -433,6 +441,8 @@ cdef class AlignmentFile: cdef bytes bmode = mode.encode('ascii') self._filename = filename = encode_filename(filename) + self._reference_filename = reference_filename = encode_filename( + reference_filename) # FIXME: Use htsFormat when it is available self.is_stream = filename == b"-" @@ -515,10 +525,8 @@ cdef class AlignmentFile: # is given, the CRAM reference arrays will be built from # the @SQ header in the header if self.is_cram and reference_filename: - # note that fn_aux takes ownership, so create - # a copy - fn = encode_filename(reference_filename) - self.htsfile.fn_aux = strdup(fn) + # note that fn_aux takes ownership, so create a copy + self.htsfile.fn_aux = strdup(self._reference_filename) # write header to htsfile if self.is_bam or self.is_cram or "h" in mode: @@ -570,6 +578,13 @@ cdef class AlignmentFile: "- is it SAM format?" % mode ) # self.header.ignore_sam_err = True + # set filename with reference sequences + if self.is_cram and reference_filename: + creference_filename = self._reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + if check_sq and self.header.n_targets == 0: raise ValueError( ("file has no sequences defined (mode='%s') - " @@ -854,7 +869,7 @@ cdef class AlignmentFile: multiple_iterators : bool - If `multiple_iterators` is True (default) multiple + If `multiple_iterators` is True, multiple iterators on the same file can be used at the same time. The iterator returned will receive its own copy of a filehandle to the file effectively re-opening the file. 
Re-opening a file @@ -1665,6 +1680,7 @@ cdef class IteratorRow: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): cdef char *cfilename + cdef char *creference_filename if not samfile.is_open(): raise ValueError("I/O operation on closed file") @@ -1686,6 +1702,13 @@ cdef class IteratorRow: self.header = sam_hdr_read(self.htsfile) assert self.header != NULL self.owns_samfile = True + # options specific to CRAM files + if samfile.is_cram and samfile._reference_filename: + creference_filename = samfile._reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + else: self.htsfile = self.samfile.htsfile self.owns_samfile = False diff --git a/pysam/cbcf.pyx b/pysam/cbcf.pyx index 2a19850..41fd44f 100644 --- a/pysam/cbcf.pyx +++ b/pysam/cbcf.pyx @@ -1807,7 +1807,7 @@ cdef class VariantHeader(object): 'missing {:d} requested samples'.format( len(missing_samples))) - keep_samples = force_bytes(b','.join(keep_samples)) + keep_samples = force_bytes(','.join(keep_samples)) cdef char *keep = keep_samples if keep_samples else NULL cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0) diff --git a/pysam/cfaidx.pxd b/pysam/cfaidx.pxd index d3aff09..7749274 100644 --- a/pysam/cfaidx.pxd +++ b/pysam/cfaidx.pxd @@ -6,7 +6,7 @@ from libc.stdio cimport FILE, printf cimport cython from cpython cimport array -from pysam.chtslib cimport faidx_t, gzFile, kstring_t +from pysam.chtslib cimport faidx_t, kstring_t, BGZF # These functions are put here and not in chtslib.pxd in order # to avoid warnings for unused functions. 
@@ -21,13 +21,10 @@ cdef extern from "pysam_stream.h" nogil: kstring_t seq kstring_t qual - gzFile gzopen(char *, char *) - kseq_t *kseq_init(gzFile) + kseq_t *kseq_init(BGZF *) int kseq_read(kseq_t *) void kseq_destroy(kseq_t *) - int gzclose(gzFile) - - kstream_t *ks_init(gzFile) + kstream_t *ks_init(BGZF *) void ks_destroy(kstream_t *) # Retrieve characters from stream until delimiter @@ -62,9 +59,10 @@ cdef class PersistentFastqProxy: cdef class FastxFile: cdef object _filename - cdef gzFile fastqfile + cdef BGZF * fastqfile cdef kseq_t * entry cdef bint persist + cdef bint is_remote cdef kseq_t * getCurrent(self) cdef int cnext(self) diff --git a/pysam/cfaidx.pyx b/pysam/cfaidx.pyx index 4db754e..78f9aac 100644 --- a/pysam/cfaidx.pyx +++ b/pysam/cfaidx.pyx @@ -60,7 +60,8 @@ from cpython.version cimport PY_MAJOR_VERSION from pysam.chtslib cimport \ faidx_nseq, fai_load, fai_destroy, fai_fetch, \ faidx_seq_len, \ - faidx_fetch_seq, gzopen, gzclose, hisremote + faidx_fetch_seq, hisremote, \ + bgzf_open, bgzf_close from pysam.cutils cimport force_bytes, force_str, charptr_to_str from pysam.cutils cimport encode_filename, from_string_and_size @@ -136,6 +137,11 @@ cdef class FastaFile: cdef char *cfilename = self._filename self.is_remote = hisremote(cfilename) + if filepath_index is not None: + raise NotImplementedError( + "setting an explicit path for the index " + "is not implemented") + # open file for reading if (self._filename != b"-" and not self.is_remote @@ -171,7 +177,9 @@ cdef class FastaFile: self.fastafile = NULL def __dealloc__(self): - self.close() + if self.fastafile != NULL: + fai_destroy(self.fastafile) + self.fastafile = NULL # context manager interface def __enter__(self): @@ -464,30 +472,40 @@ cdef class FastxFile: on the file continues. 
''' - self.close() + if self.fastqfile != NULL: + self.close() - if not os.path.exists(filename): - raise IOError("no such file or directory: %s" % filename) + self._filename = encode_filename(filename) + cdef char *cfilename = self._filename + self.is_remote = hisremote(cfilename) + + # open file for reading + if (self._filename != b"-" + and not self.is_remote + and not os.path.exists(filename)): + raise IOError("file `%s` not found" % filename) self.persist = persist - self._filename = encode_filename(filename) - cdef char *cfilename = self._filename with nogil: - self.fastqfile = gzopen(cfilename, "r") + self.fastqfile = bgzf_open(cfilename, "r") self.entry = kseq_init(self.fastqfile) self._filename = filename def close(self): '''close the file.''' + if self.fastqfile != NULL: + bgzf_close(self.fastqfile) + self.fastqfile = NULL if self.entry != NULL: - gzclose(self.fastqfile) - if self.entry: - kseq_destroy(self.entry) - self.entry = NULL + kseq_destroy(self.entry) + self.entry = NULL def __dealloc__(self): - self.close() + if self.fastqfile != NULL: + bgzf_close(self.fastqfile) + if self.entry: + kseq_destroy(self.entry) # context manager interface def __enter__(self): diff --git a/pysam/chtslib.pxd b/pysam/chtslib.pxd index 0cee075..33c1559 100644 --- a/pysam/chtslib.pxd +++ b/pysam/chtslib.pxd @@ -9,20 +9,6 @@ cdef extern from "Python.h": FILE* PyFile_AsFile(object) -cdef extern from "zlib.h" nogil: - ctypedef void * gzFile - ctypedef int64_t z_off_t - - int gzclose(gzFile fp) - int gzread(gzFile fp, void *buf, unsigned int n) - char *gzerror(gzFile fp, int *errnum) - - gzFile gzopen( char *path, char *mode) - gzFile gzdopen (int fd, char *mode) - char * gzgets(gzFile file, char *buf, int len) - int gzeof(gzFile file) - - cdef extern from "htslib/kstring.h" nogil: ctypedef struct kstring_t: size_t l, m @@ -398,6 +384,29 @@ cdef extern from "htslib/hts.h" nogil: no_compression, gzip, bgzf, custom compression_maximum + enum hts_fmt_option: + 
CRAM_OPT_DECODE_MD, + CRAM_OPT_PREFIX, + CRAM_OPT_VERBOSITY, + CRAM_OPT_SEQS_PER_SLICE, + CRAM_OPT_SLICES_PER_CONTAINER, + CRAM_OPT_RANGE, + CRAM_OPT_VERSION, + CRAM_OPT_EMBED_REF, + CRAM_OPT_IGNORE_MD5, + CRAM_OPT_REFERENCE, + CRAM_OPT_MULTI_SEQ_PER_SLICE, + CRAM_OPT_NO_REF, + CRAM_OPT_USE_BZIP2, + CRAM_OPT_SHARED_REF, + CRAM_OPT_NTHREADS, + CRAM_OPT_THREAD_POOL, + CRAM_OPT_USE_LZMA, + CRAM_OPT_USE_RANS, + CRAM_OPT_REQUIRED_FIELDS, + HTS_OPT_COMPRESSION_LEVEL, + HTS_OPT_NTHREADS, + ctypedef struct htsVersion: short major, minor @@ -519,7 +528,7 @@ cdef extern from "htslib/hts.h" nogil: # @param opt The CRAM_OPT_* option. # @param ... Optional arguments, dependent on the option used. # @return 0 for success, or negative if an error occurred. - #int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...) + int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...) int hts_getline(htsFile *fp, int delimiter, kstring_t *str) char **hts_readlines(const char *fn, int *_n) diff --git a/pysam/ctabix.pxd b/pysam/ctabix.pxd index 39eed77..028090e 100644 --- a/pysam/ctabix.pxd +++ b/pysam/ctabix.pxd @@ -14,7 +14,7 @@ cdef extern from "unistd.h" nogil: int close(int fd) from pysam.chtslib cimport hts_idx_t, hts_itr_t, htsFile, \ - gzFile, tbx_t, kstring_t + tbx_t, kstring_t, BGZF # These functions are put here and not in chtslib.pxd in order # to avoid warnings for unused functions. 
@@ -29,13 +29,10 @@ cdef extern from "pysam_stream.h" nogil: kstring_t seq kstring_t qual - gzFile gzopen(char *, char *) - kseq_t *kseq_init(gzFile) + kseq_t *kseq_init(BGZF *) int kseq_read(kseq_t *) void kseq_destroy(kseq_t *) - int gzclose(gzFile) - - kstream_t *ks_init(gzFile) + kstream_t *ks_init(BGZF *) void ks_destroy(kstream_t *) # Retrieve characters from stream until delimiter @@ -47,7 +44,7 @@ cdef extern from "pysam_stream.h" nogil: cdef class tabix_file_iterator: - cdef gzFile fh + cdef BGZF * fh cdef kstream_t * kstream cdef kstring_t buffer cdef size_t size @@ -104,7 +101,7 @@ cdef class TabixIteratorParsed(TabixIterator): cdef class GZIterator: cdef object _filename - cdef gzFile gzipfile + cdef BGZF * gzipfile cdef kstream_t * kstream cdef kstring_t buffer cdef int __cnext__(self) diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx index 0bb1284..a23fa87 100644 --- a/pysam/ctabix.pyx +++ b/pysam/ctabix.pyx @@ -69,10 +69,10 @@ from cpython.version cimport PY_MAJOR_VERSION cimport pysam.ctabixproxies as ctabixproxies from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ - BGZF, bgzf_open, bgzf_close, bgzf_write, gzFile, \ + BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ - tbx_destroy, gzopen, gzclose, gzerror, gzdopen, hisremote + tbx_destroy, hisremote from pysam.cutils cimport force_bytes, force_str, charptr_to_str from pysam.cutils cimport encode_filename, from_string_and_size @@ -532,7 +532,7 @@ cdef class TabixFile: cdef int x result = [] for x from 0 <= x < nsequences: - result.append(sequences[x]) + result.append(force_str(sequences[x])) # htslib instructions: # only free container, not the sequences themselves @@ -681,7 +681,7 @@ cdef class GZIterator: filename = encode_filename(filename) cdef char *cfilename = filename with nogil: - self.gzipfile = gzopen(cfilename, "r") + self.gzipfile = 
bgzf_open(cfilename, "r") self._filename = filename self.kstream = ks_init(self.gzipfile) self.encoding = encoding @@ -693,11 +693,12 @@ cdef class GZIterator: def __dealloc__(self): '''close file.''' if self.gzipfile != NULL: - gzclose(self.gzipfile) + bgzf_close(self.gzipfile) self.gzipfile = NULL if self.buffer.s != NULL: free(self.buffer.s) - ks_destroy(self.kstream) + if self.kstream != NULL: + ks_destroy(self.kstream) def __iter__(self): return self @@ -1003,10 +1004,10 @@ def tabix_index( filename, ######################################################### ## Iterators for parsing through unindexed files. ######################################################### -cdef buildGzipError(void *gzfp): - cdef int errnum = 0 - cdef char *s = gzerror(gzfp, &errnum) - return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s) +# cdef buildGzipError(void *gzfp): +# cdef int errnum = 0 +# cdef char *s = gzerror(gzfp, &errnum) +# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s) cdef class tabix_file_iterator: @@ -1034,7 +1035,7 @@ cdef class tabix_file_iterator: # in this case gzread will directly read from the file without decompression. # When reading, this will be detected automatically by looking # for the magic two-byte gzip header. 
- self.fh = gzdopen(self.duplicated_fd, 'r') + self.fh = bgzf_dopen(self.duplicated_fd, 'r') if self.fh == NULL: raise IOError('%s' % strerror(errno)) @@ -1076,14 +1077,14 @@ cdef class tabix_file_iterator: # gzgets terminates at \n, no need to test # parser creates a copy - return self.parser.parse( b, self.buffer.l) + return self.parser.parse(b, self.buffer.l) raise StopIteration def __dealloc__(self): free(self.buffer.s) ks_destroy(self.kstream) - gzclose(self.fh) + bgzf_close(self.fh) def __next__(self): return self.__cnext__() diff --git a/pysam/ctabixproxies.pyx b/pysam/ctabixproxies.pyx index d72f082..f5288cc 100644 --- a/pysam/ctabixproxies.pyx +++ b/pysam/ctabixproxies.pyx @@ -8,6 +8,8 @@ from libc.stdlib cimport atoi, atol, atof from pysam.cutils cimport force_bytes, force_str, charptr_to_str from pysam.cutils cimport encode_filename, from_string_and_size +import collections + cdef char *StrOrEmpty(char * buffer): if buffer == NULL: return "" @@ -88,7 +90,8 @@ cdef class TupleProxy: elif op == 3: # != operator return self.compare(other) != 0 else: - return NotImplemented + err_msg = "op {0} isn't implemented yet".format(op) + raise NotImplementedError(err_msg) cdef take(self, char * buffer, size_t nbytes): '''start presenting buffer. @@ -390,6 +393,8 @@ cdef class GTFProxy(TupleProxy): def __get__(self): return self._getindex(1) def __set__(self, value): + if value is None: + value = "." self._setindex(1, value) property feature: @@ -397,6 +402,8 @@ cdef class GTFProxy(TupleProxy): def __get__(self): return self._getindex(2) def __set__(self, value): + if value is None: + value = "." self._setindex(2, value) property start: @@ -423,29 +430,40 @@ cdef class GTFProxy(TupleProxy): return float(v) def __set__(self, value): - self._setindex(5, value) + if value is None: + value = "." 
+ self._setindex(5, str(value)) property strand: '''feature strand.''' - def __get__(self ): - return self._getindex(6) + def __get__(self): + return self._getindex(6) def __set__(self, value ): + if value is None: + value = "." self._setindex(6, value) property frame: '''feature frame.''' def __get__(self): - return self._getindex(7) + v = self._getindex(7) + if v == "" or v[0] == '.': + return v + else: + return int(v) + def __set__(self, value): - self._setindex(7, value) + if value is None: + value = "." + self._setindex(7, str(value)) property attributes: '''feature attributes (as a string).''' def __get__(self): if self.hasOwnAttributes: - return self._attributes + return force_str(self._attributes) else: - return self._getindex(8) + return force_str(self._getindex(8)) def __set__( self, value): if self.hasOwnAttributes: free(self._attributes) @@ -481,7 +499,7 @@ cdef class GTFProxy(TupleProxy): # Remove white space to prevent a last empty field. fields = [x.strip() for x in attributes.strip().split("; ")] - result = {} + result = collections.OrderedDict() for f in fields: @@ -529,7 +547,7 @@ cdef class GTFProxy(TupleProxy): else: aa.append( '%s %s' % (k,str(v)) ) - a = "; ".join( aa ) + ";" + a = force_bytes("; ".join(aa) + ";") p = a l = len(a) self._attributes = calloc(l + 1, sizeof(char)) @@ -552,9 +570,9 @@ cdef class GTFProxy(TupleProxy): str(self.start+1), str(self.end), toDot(self.score), - self.strand, - self.frame, - self.attributes ) ) + toDot(self.strand), + toDot(self.frame), + self.attributes)) else: return TupleProxy.__str__(self) @@ -638,6 +656,26 @@ cdef class GTFProxy(TupleProxy): r[name] = value self.fromDict(r) + def __cmp__(self, other): + return (self.contig, self.strand, self.start) < \ + (other.contig, other.strand, other.start) + + # python 3 compatibility + def __richcmp__(GTFProxy self, GTFProxy other, int op): + if op == 0: + return (self.contig, self.strand, self.start) < \ + (other.contig, other.strand, other.start) + elif op == 
1: + return (self.contig, self.strand, self.start) <= \ + (other.contig, other.strand, other.start) + elif op == 2: + return self.compare(other) == 0 + elif op == 3: + return self.compare(other) != 0 + else: + err_msg = "op {0} isn't implemented yet".format(op) + raise NotImplementedError(err_msg) + cdef class NamedTupleProxy(TupleProxy): @@ -705,8 +743,8 @@ cdef class BedProxy(NamedTupleProxy): # do automatic conversion self.contig = self.fields[0] - self.start = atoi( self.fields[1] ) - self.end = atoi( self.fields[2] ) + self.start = atoi(self.fields[1]) + self.end = atoi(self.fields[2]) # __setattr__ in base class seems to take precedence # hence implement setters in __setattr__ diff --git a/pysam/cutils.pxd b/pysam/cutils.pxd index 36fe554..81e544a 100644 --- a/pysam/cutils.pxd +++ b/pysam/cutils.pxd @@ -32,4 +32,7 @@ cdef extern from "pysam_util.h": int bcftools_main(int argc, char *argv[]) void pysam_set_stderr(int fd) void pysam_unset_stderr() + void pysam_set_stdout(int fd) + void pysam_set_stdout_fn(const char *) + void pysam_unset_stdout() void set_optind(int) diff --git a/pysam/cutils.pyx b/pysam/cutils.pyx index 482db89..7510727 100644 --- a/pysam/cutils.pyx +++ b/pysam/cutils.pyx @@ -14,6 +14,7 @@ from libc.stdlib cimport calloc, free from libc.string cimport strncpy from libc.stdio cimport fprintf, stderr, fflush from libc.stdio cimport stdout as c_stdout +from posix.fcntl cimport open as c_open, O_WRONLY ##################################################################### # hard-coded constants @@ -227,129 +228,75 @@ cpdef parse_region(reference=None, return force_bytes(reference), rstart, rend -@contextmanager -def stdout_redirector(to=os.devnull): - ''' - import os - - with stdout_redirected(to=filename): - print("from Python") - os.system("echo non-Python applications are also supported") - - see http://stackoverflow.com/questions/5081657/how-do-i-prevent-a-c-shared-library-to-print-on-stdout-in-python/17954769#17954769 - ''' - fd = 
sys.stdout.fileno() - - def _redirect_stdout(to): - # flush C-level stdout - try: - fflush(c_stdout) - sys.stdout.close() - except (OSError, IOError): - # some tools close stdout - # Py3: OSError - # Py2: IOError - pass - - # fd writes to 'to' file - os.dup2(to.fileno(), fd) - # Python writes to fd - if IS_PYTHON3: - sys.stdout = io.TextIOWrapper( - os.fdopen(fd, 'wb')) - else: - sys.stdout = os.fdopen(fd, 'w') - - with os.fdopen(os.dup(fd), 'w') as old_stdout: - _redirect_stdout(to) - try: - yield # allow code to be run with the redirected stdout - finally: - _redirect_stdout(old_stdout) - # restore stdout. - # buffering and flags may be different - -# def stdout_redirector(stream): -# """ -# See discussion in: - -# http://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/ -# """ - -# # The original fd stdout points to. Usually 1 on POSIX systems. -# original_stdout_fd = sys.stdout.fileno() -# print ("original_fd=", original_stdout_fd) -# def _redirect_stdout(to_fd): -# """Redirect stdout to the given file descriptor.""" -# # Flush the C-level buffer stdout -# fflush(c_stdout) -# # Flush and close sys.stdout - also closes the file descriptor -# # (fd) -# sys.stdout.close() -# # Make original_stdout_fd point to the same file as to_fd -# os.dup2(to_fd, original_stdout_fd) -# # Create a new sys.stdout that points to the redirected fd -# if IS_PYTHON3: -# sys.stdout = io.TextIOWrapper( -# os.fdopen(original_stdout_fd, 'wb')) - -# # Save a copy of the original stdout fd in saved_stdout_fd -# saved_stdout_fd = os.dup(original_stdout_fd) -# try: -# # Create a temporary file and redirect stdout to it -# tfile = tempfile.TemporaryFile(mode='w+b') -# _redirect_stdout(tfile.fileno()) -# # Yield to caller, then redirect stdout back to the saved fd -# yield -# _redirect_stdout(saved_stdout_fd) -# # Copy contents of temporary file to the given stream -# tfile.flush() -# tfile.seek(0, io.SEEK_SET) -# stream.write(tfile.read()) -# finally: -# tfile.close() -# 
os.close(saved_stdout_fd) - - def _pysam_dispatch(collection, method, - args=(), - catch_stdout=True): + args=None, + catch_stdout=True, + save_stdout=None): '''call ``method`` in samtools/bcftools providing arguments in args. - .. note:: - This method redirects stdout to capture it - from samtools. If for some reason stdout disappears - the reason might be in this method. - - .. note:: - This method captures stdout and stderr using temporary files, - which are then read into memory in their entirety. This method - is slow and might cause large memory overhead. - - Catching of stdout can be turned of by setting *catch_stdout* to + Catching of stdout can be turned off by setting *catch_stdout* to False. - See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily - on the topic of redirecting stderr/stdout. - ''' - # note that debugging this module can be a problem - # as stdout/stderr will not appear on the terminal - # some special cases if method == "index": if not os.path.exists(args[0]): raise IOError("No such file or directory: '%s'" % args[0]) + + if args is None: + args = [] + else: + args = list(args) - # redirect stderr and stdout to file + # redirect stderr to file stderr_h, stderr_f = tempfile.mkstemp() pysam_set_stderr(stderr_h) + # redirect stdout to file + if save_stdout: + stdout_f = save_stdout + stdout_h = c_open(force_bytes(stdout_f), + O_WRONLY) + if stdout_h == -1: + raise OSError("error while opening {} for writing".format(stdout_f)) + + pysam_set_stdout_fn(force_bytes(stdout_f)) + pysam_set_stdout(stdout_h) + elif catch_stdout: + stdout_h, stdout_f = tempfile.mkstemp() + + MAP_STDOUT_OPTIONS = { + "samtools": { + "view": "-o {}", + "mpileup": "-o {}", + "depad": "-o {}", + "calmd": "", # uses pysam_stdout_fn + }, + "bcftools": {} + } + + stdout_option = None + if collection == "bcftools": + # in bcftools, most methods accept -o, the exceptions + # are below: + if method not in ("index", "roh", "stats"): + stdout_option = "-o 
{}" + elif method in MAP_STDOUT_OPTIONS[collection]: + stdout_option = MAP_STDOUT_OPTIONS[collection][method] + + if stdout_option is not None: + os.close(stdout_h) + pysam_set_stdout_fn(force_bytes(stdout_f)) + args.extend(stdout_option.format(stdout_f).split(" ")) + else: + pysam_set_stdout(stdout_h) + else: + pysam_set_stdout_fn("-") + # setup the function call to samtools/bcftools main cdef char ** cargs cdef int i, n, retval, l - n = len(args) method = force_bytes(method) collection = force_bytes(collection) @@ -381,41 +328,40 @@ def _pysam_dispatch(collection, set_optind(0) # call samtools/bcftools - if catch_stdout: - with tempfile.TemporaryFile(mode='w+b') as tfile: - with stdout_redirector(tfile): - if collection == b"samtools": - retval = samtools_main(n + 2, cargs) - elif collection == b"bcftools": - retval = bcftools_main(n + 2, cargs) - tfile.flush() - tfile.seek(0) - # do not force str, as output might be binary, - # for example BAM, VCF.gz, etc. - out_stdout = tfile.read() - else: - if collection == b"samtools": - retval = samtools_main(n + 2, cargs) - elif collection == b"bcftools": - retval = bcftools_main(n + 2, cargs) - out_stdout = None + if collection == b"samtools": + retval = samtools_main(n + 2, cargs) + elif collection == b"bcftools": + retval = bcftools_main(n + 2, cargs) for i from 0 <= i < n: free(cargs[i + 2]) free(cargs) # get error messages + def _collect(fn): + out = [] + try: + with open(fn, "r") as inf: + out = inf.read() + except UnicodeDecodeError: + with open(fn, "rb") as inf: + # read binary output + out = inf.read() + finally: + os.remove(fn) + return out + pysam_unset_stderr() - out_stderr = [] - try: - with open(stderr_f, "r") as inf: - out_stderr = inf.readlines() - except UnicodeDecodeError: - with open( stderr_f, "rb") as inf: - # read binary output - out_stderr = inf.read() - finally: - os.remove(stderr_f) + out_stderr = _collect(stderr_f) + + if save_stdout: + pysam_unset_stdout() + out_stdout = None + elif 
catch_stdout: + pysam_unset_stdout() + out_stdout = _collect(stdout_f) + else: + out_stdout = None return retval, out_stderr, out_stdout diff --git a/pysam/cvcf.pyx b/pysam/cvcf.pyx index 83d3663..5e2fda2 100644 --- a/pysam/cvcf.pyx +++ b/pysam/cvcf.pyx @@ -114,6 +114,7 @@ cdef class VCFRecord( ctabixproxies.TupleProxy): def __init__(self, vcf): self.vcf = vcf self.encoding = vcf.encoding + # if len(data) != len(self.vcf._samples): # self.vcf.error(str(data), # self.BAD_NUMBER_OF_COLUMNS, @@ -133,7 +134,7 @@ cdef class VCFRecord( ctabixproxies.TupleProxy): def error(self, line, error, opt=None): '''raise error.''' # pass to vcf file for error handling - return self.vcf.error( line, error, opt ) + return self.vcf.error(line, error, opt) cdef update(self, char * buffer, size_t nbytes): '''update internal data. @@ -349,6 +350,7 @@ class VCF(object): if leftalign: self._leftalign = leftalign self._lines = lines self.encoding = "ascii" + self.tabixfile = None def error(self,line,error,opt=None): if error in self._ignored_errors: return @@ -1047,6 +1049,15 @@ class VCF(object): self.tabixfile = pysam.Tabixfile(filename, encoding=encoding) self._parse_header(self.tabixfile.header) + def __del__(self): + self.close() + self.tabixfile = None + + def close(self): + if self.tabixfile: + self.tabixfile.close() + self.tabixfile = None + def fetch(self, reference=None, start=None, diff --git a/pysam/pysam_stream.h b/pysam/pysam_stream.h index 3e93e29..3a4eb16 100644 --- a/pysam/pysam_stream.h +++ b/pysam/pysam_stream.h @@ -5,7 +5,8 @@ // ####################################################### // fastq parsing -KSEQ_INIT(gzFile, gzread) +// KSEQ_INIT(gzFile, gzread) +KSEQ_INIT(BGZF *, bgzf_read) //KSTREAM_INIT( gzFile, gzread, 16384) diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c index e669e1d..94717c8 100644 --- a/pysam/pysam_util.c +++ b/pysam/pysam_util.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "bam.h" #include "bam_endian.h" #include 
"htslib/khash.h" @@ -8,23 +9,52 @@ #include "htslib/knetfile.h" #include "pysam_util.h" -// Definition of pysamerr -#include "stdio.h" -FILE * pysamerr = NULL; + +FILE * pysam_stderr = NULL; +FILE * pysam_stdout = NULL; +const char * pysam_stdout_fn = NULL; +int PYSAM_STDOUT_FILENO = STDOUT_FILENO; + FILE * pysam_set_stderr(int fd) { - if (pysamerr != NULL) - fclose(pysamerr); - pysamerr = fdopen(fd, "w"); - return pysamerr; + if (pysam_stderr != NULL) + fclose(pysam_stderr); + pysam_stderr = fdopen(fd, "w"); + return pysam_stderr; } void pysam_unset_stderr(void) { - if (pysamerr != NULL) - fclose(pysamerr); - pysamerr = fopen("/dev/null", "w"); + if (pysam_stderr != NULL) + fclose(pysam_stderr); + pysam_stderr = fopen("/dev/null", "w"); +} + +FILE * pysam_set_stdout(int fd) +{ + if (pysam_stdout != NULL) + fclose(pysam_stdout); + pysam_stdout = fdopen(fd, "w"); + if (pysam_stdout == NULL) + { + fprintf(pysam_stderr, "could not set stdout to fd %i", fd); + } + PYSAM_STDOUT_FILENO = fd; + return pysam_stdout; +} + +void pysam_set_stdout_fn(const char *fn) +{ + pysam_stdout_fn = fn; +} + +void pysam_unset_stdout(void) +{ + if (pysam_stdout != NULL) + fclose(pysam_stdout); + pysam_stdout = fopen("/dev/null", "w"); + PYSAM_STDOUT_FILENO = STDOUT_FILENO; } void set_optind(int val) diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h index 5f2359f..a30808f 100644 --- a/pysam/pysam_util.h +++ b/pysam/pysam_util.h @@ -1,20 +1,35 @@ #ifndef PYSAM_UTIL_H #define PYSAM_UTIL_H -////////////////////////////////////////////////////////////////// /*! set pysam standard error to point to file descriptor Setting the stderr will close the previous stderr. */ FILE * pysam_set_stderr(int fd); -////////////////////////////////////////////////////////////////// +/*! set pysam standard output to point to file descriptor + + Setting the stderr will close the previous stdout. + */ +FILE * pysam_set_stdout(int fd); + +/*! 
set pysam standard output to point to filename + + */ +void pysam_set_stdout_fn(const char * fn); + /*! set pysam standard error to /dev/null. Unsetting the stderr will close the previous stderr. */ void pysam_unset_stderr(void); +/*! set pysam standard output to /dev/null. + + Unsetting the stdout will close the previous stdout. + */ +void pysam_unset_stdout(void); + int pysam_dispatch(int argc, char *argv[]); void set_optind(int); diff --git a/pysam/tabix_util.c b/pysam/tabix_util.c index f94b09d..bff140e 100644 --- a/pysam/tabix_util.c +++ b/pysam/tabix_util.c @@ -1,8 +1,6 @@ -// Definition of pysamerr #include #include #include -FILE * pysamerr = NULL; #if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700) /* diff --git a/pysam/utils.py b/pysam/utils.py index 0e49d54..c5bb539 100644 --- a/pysam/utils.py +++ b/pysam/utils.py @@ -46,14 +46,24 @@ class PysamDispatcher(object): '''execute a samtools command. Keyword arguments: - catch_stdout -- redirect stdout from the samtools command and return as variable (default True) + catch_stdout -- redirect stdout from the samtools command and + return as variable (default True) + save_stdout -- redirect stdout to a filename. raw -- ignore any parsers associated with this samtools command. + split_lines -- return stdout (if catch_stdout is True) and stderr + as a list of strings. 
''' retval, stderr, stdout = _pysam_dispatch( self.collection, self.dispatch, args, - catch_stdout=kwargs.get("catch_stdout", True)) + catch_stdout=kwargs.get("catch_stdout", True), + save_stdout=kwargs.get("save_stdout", None)) + + if kwargs.get("split_lines", False): + stdout = stdout.splitlines() + if stderr: + stderr = stderr.splitlines() if retval: raise SamtoolsError( @@ -61,8 +71,8 @@ class PysamDispatcher(object): "stdout=%s, stderr=%s" % (self.collection, retval, - "\n".join(stdout), - "\n".join(stderr))) + stdout, + stderr)) self.stderr = stderr @@ -84,5 +94,5 @@ class PysamDispatcher(object): '''return the samtools usage information for this command''' retval, stderr, stdout = csamtools._samtools_dispatch( self.dispatch) - return "".join(stderr) + return stderr diff --git a/pysam/version.py b/pysam/version.py index 815e4b9..15cefc4 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,7 +1,7 @@ # pysam versioning information -__version__ = "0.9.0" +__version__ = "0.9.1" -__samtools_version__ = "1.3" +__samtools_version__ = "1.3.1" -__htslib_version__ = "1.3" +__htslib_version__ = "1.3.1" diff --git a/run_tests_travis.sh b/run_tests_travis.sh index d2d9988..f1fcdce 100755 --- a/run_tests_travis.sh +++ b/run_tests_travis.sh @@ -34,21 +34,21 @@ mkdir -p $WORKDIR/external-tools # install htslib cd $WORKDIR/external-tools -curl -L https://github.com/samtools/htslib/releases/download/1.3/htslib-1.3.tar.bz2 > htslib-1.3.tar.bz2 -tar xjvf htslib-1.3.tar.bz2 -cd htslib-1.3 +curl -L https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 > htslib-1.3.1.tar.bz2 +tar xjvf htslib-1.3.1.tar.bz2 +cd htslib-1.3.1 make -PATH=$PATH:$WORKDIR/external-tools/htslib-1.3 -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3 +PATH=$PATH:$WORKDIR/external-tools/htslib-1.3.1 +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3.1 # install samtools, compile against htslib cd $WORKDIR/external-tools -curl -L 
http://downloads.sourceforge.net/project/samtools/samtools/1.3/samtools-1.3.tar.bz2 > samtools-1.3.tar.bz2 -tar xjvf samtools-1.3.tar.bz2 -cd samtools-1.3 -./configure --with-htslib=../htslib-1.3 +curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3.1/samtools-1.3.1.tar.bz2 > samtools-1.3.1.tar.bz2 +tar xjvf samtools-1.3.1.tar.bz2 +cd samtools-1.3.1 +./configure --with-htslib=../htslib-1.3.1 make -PATH=$PATH:$WORKDIR/external-tools/samtools-1.3 +PATH=$PATH:$WORKDIR/external-tools/samtools-1.3.1 echo "installed samtools" samtools --version @@ -59,12 +59,12 @@ fi # install bcftools cd $WORKDIR/external-tools -curl -L https://github.com/samtools/bcftools/releases/download/1.3/bcftools-1.3.tar.bz2 > bcftools-1.3.tar.bz2 -tar xjf bcftools-1.3.tar.bz2 -cd bcftools-1.3 -./configure --with-htslib=../htslib-1.3 +curl -L https://github.com/samtools/bcftools/releases/download/1.3.1/bcftools-1.3.1.tar.bz2 > bcftools-1.3.1.tar.bz2 +tar xjf bcftools-1.3.1.tar.bz2 +cd bcftools-1.3.1 +./configure --with-htslib=../htslib-1.3.1 make -PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3 +PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3.1 echo "installed bcftools" bcftools --version diff --git a/samtools/bam.c b/samtools/bam.c index afab668..4965e24 100644 --- a/samtools/bam.c +++ b/samtools/bam.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -34,15 +36,22 @@ char *bam_format1(const bam_header_t *header, const bam1_t *b) { kstring_t str; str.l = str.m = 0; str.s = NULL; - sam_format1(header, b, &str); + if (sam_format1(header, b, &str) < 0) { + free(str.s); + str.s = NULL; + return NULL; + } return str.s; } -void bam_view1(const bam_header_t *header, const bam1_t *b) +int bam_view1(const bam_header_t *header, const bam1_t *b) { char *s = bam_format1(header, b); - puts(s); + int ret = -1; + if (!s) return -1; + if (puts(s) != EOF) ret = 0; free(s); + return ret; } int bam_validate1(const bam_header_t *header, const bam1_t *b) @@ -103,6 +112,9 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b) last = *cp++; } + if (!ID || !LB) + continue; + // Check it's the correct ID if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t') continue; diff --git a/samtools/bam.c.pysam.c b/samtools/bam.c.pysam.c index a9da5b9..188fe8c 100644 --- a/samtools/bam.c.pysam.c +++ b/samtools/bam.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -36,15 +38,22 @@ char *bam_format1(const bam_header_t *header, const bam1_t *b) { kstring_t str; str.l = str.m = 0; str.s = NULL; - sam_format1(header, b, &str); + if (sam_format1(header, b, &str) < 0) { + free(str.s); + str.s = NULL; + return NULL; + } return str.s; } -void bam_view1(const bam_header_t *header, const bam1_t *b) +int bam_view1(const bam_header_t *header, const bam1_t *b) { char *s = bam_format1(header, b); - puts(s); + int ret = -1; + if (!s) return -1; + if (fputs(s, pysam_stdout) & fputc('\n', pysam_stdout) != EOF) ret = 0; free(s); + return ret; } int bam_validate1(const bam_header_t *header, const bam1_t *b) @@ -105,6 +114,9 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b) last = *cp++; } + if (!ID || !LB) + continue; + // Check it's the correct ID if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t') continue; diff --git a/samtools/bam.h b/samtools/bam.h index 57aa044..e928ce4 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */ @copyright Genome Research Ltd. */ -#define BAM_VERSION "1.3" +#define BAM_VERSION "1.3.1" #include #include @@ -322,8 +322,11 @@ extern "C" { */ char *bam_format1(const bam_header_t *header, const bam1_t *b); - /*! @abstract Formats a BAM record and writes it and \n to stdout */ - void bam_view1(const bam_header_t *header, const bam1_t *b); + /*! + @abstract Formats a BAM record and writes it and \n to stdout + @return 0 if successful, -1 on error + */ + int bam_view1(const bam_header_t *header, const bam1_t *b); /*! @abstract Check whether a BAM record is plausibly valid diff --git a/samtools/bam2bcf.c b/samtools/bam2bcf.c index ed433b1..85ce307 100644 --- a/samtools/bam2bcf.c +++ b/samtools/bam2bcf.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include diff --git a/samtools/bam2bcf.c.pysam.c b/samtools/bam2bcf.c.pysam.c index be3876d..6938ec0 100644 --- a/samtools/bam2bcf.c.pysam.c +++ b/samtools/bam2bcf.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -108,7 +110,7 @@ static int get_position(const bam_pileup1_t *p, int *len) if ( cig==BAM_CHARD_CLIP ) continue; if ( cig==BAM_CPAD ) continue; if ( cig==BAM_CREF_SKIP ) continue; - fprintf(pysamerr,"todo: cigar %d\n", cig); + fprintf(pysam_stderr,"todo: cigar %d\n", cig); assert(0); } *len = n_tot_bases; @@ -479,7 +481,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) double sum = 0; const double log2 = log(2.0); - // fprintf(pysamerr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); + // fprintf(pysam_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); int i; for (i=0; in; i++) { @@ -494,7 +496,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) else tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p; sum += tmp; - // fprintf(pysamerr,"oi=%d %e\n", oi,tmp); + // fprintf(pysam_stderr,"oi=%d %e\n", oi,tmp); } call->seg_bias = sum; } @@ -658,7 +660,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int } } -// if (ref_base < 0) fprintf(pysamerr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); +// if (ref_base < 0) fprintf(pysam_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); call->shift = (int)(sum_min + .499); } // combine annotations diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c index e1c45c4..5b353fc 100644 --- a/samtools/bam2bcf_indel.c +++ b/samtools/bam2bcf_indel.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 
THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c index 45e1101..21cbb03 100644 --- a/samtools/bam2bcf_indel.c.pysam.c +++ b/samtools/bam2bcf_indel.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -225,7 +227,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla free(aux); // TODO revisit how/whether to control printing this warning if (hts_verbose >= 2) - fprintf(pysamerr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); + fprintf(pysam_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); return -1; } types = (int*)calloc(n_types, sizeof(int)); @@ -298,7 +300,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; if (max_i >= 0) r[max_i] = 15; if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysamerr); fputc('\n', pysamerr); + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysam_stderr); fputc('\n', pysam_stderr); } free(ref0); free(cns); } @@ -366,7 +368,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); else ir = est_indelreg(pos, ref, -types[t], 0); if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(pysamerr, "%d, %d, %d\n", pos, types[t], ir); +// fprintf(pysam_stderr, "%d, %d, %d\n", pos, types[t], ir); // realignment for (s = K = 0; s < n; ++s) { // write ref2 @@ -428,11 +430,11 @@ int bcf_call_gap_prep(int n, int *n_plp, 
bam_pileup1_t **plp, int pos, bcf_calla } /* for (l = 0; l < tend - tbeg + abs(types[t]); ++l) - fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysamerr); - fputc('\n', pysamerr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysamerr); - fputc('\n', pysamerr); - fprintf(pysamerr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); + fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysam_stderr); + fputc('\n', pysam_stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysam_stderr); + fputc('\n', pysam_stderr); + fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); */ } } @@ -488,7 +490,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; -// fprintf(pysamerr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); +// fprintf(pysam_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -520,7 +522,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (x == bca->indel_types[j]) break; p->aux = j<<16 | (j == 4? 
0 : (p->aux&0xffff)); if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(pysamerr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); } } } diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index f109447..21220f1 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -30,6 +30,8 @@ DEALINGS IN THE SOFTWARE. */ * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz */ +#include + #include #include #include diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index 6549949..9d9dc40 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -32,6 +32,8 @@ DEALINGS IN THE SOFTWARE. */ * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz */ +#include + #include #include #include @@ -73,26 +75,26 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here int read_file_list(const char *file_list,int *n,char **argv[]); static int usage() { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -a output all positions (including zero depth)\n"); - fprintf(pysamerr, " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n"); - fprintf(pysamerr, " -b list of positions or regions\n"); - fprintf(pysamerr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(pysamerr, " -l read length threshold (ignore reads shorter than )\n"); - fprintf(pysamerr, " -d/-m maximum coverage depth [8000]\n"); // the htslib's default - fprintf(pysamerr, " -q base quality threshold\n"); - fprintf(pysamerr, " -Q mapping quality threshold\n"); - fprintf(pysamerr, " -r region\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -a output all positions (including zero depth)\n"); + fprintf(pysam_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); + fprintf(pysam_stderr, " -b list of positions or regions\n"); + fprintf(pysam_stderr, " -f list of input BAM filenames, one per line [null]\n"); + fprintf(pysam_stderr, " -l read length threshold (ignore reads shorter than )\n"); + fprintf(pysam_stderr, " -d/-m maximum coverage depth [8000]\n"); // the htslib's default + fprintf(pysam_stderr, " -q base quality threshold\n"); + fprintf(pysam_stderr, " -Q mapping quality threshold\n"); + fprintf(pysam_stderr, " -r region\n"); - sam_global_opt_help(pysamerr, "-.--."); + sam_global_opt_help(pysam_stderr, "-.--."); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "The output is a simple tab-separated table with three columns: reference name,\n"); - fprintf(pysamerr, "position, and coverage depth. Note that positions with zero coverage may be\n"); - fprintf(pysamerr, "omitted by default; see the -a option.\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); + fprintf(pysam_stderr, "position, and coverage depth. 
Note that positions with zero coverage may be\n"); + fprintf(pysam_stderr, "omitted by default; see the -a option.\n"); + fprintf(pysam_stderr, "\n"); return 1; } @@ -162,18 +164,18 @@ int main_depth(int argc, char *argv[]) rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; if (baseQ) rf |= SAM_QUAL; if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header if (data[i]->hdr == NULL) { - fprintf(pysamerr, "Couldn't read header for \"%s\"\n", + fprintf(pysam_stderr, "Couldn't read header for \"%s\"\n", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; @@ -218,10 +220,10 @@ int main_depth(int argc, char *argv[]) while (++last_pos < h->target_len[last_tid]) { if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; - fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); + fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1); for (i = 0; i < n; i++) - putchar('\t'), putchar('0'); - putchar('\n'); + fputc('\t', pysam_stdout), fputc('0', pysam_stdout); + fputc('\n', pysam_stdout); } } last_tid++; @@ -233,16 +235,16 @@ int main_depth(int argc, char *argv[]) if (last_pos < beg) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) continue; - fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1); + fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", 
last_pos+1); for (i = 0; i < n; i++) - putchar('\t'), putchar('0'); - putchar('\n'); + fputc('\t', pysam_stdout), fputc('0', pysam_stdout); + fputc('\n', pysam_stdout); } last_tid = tid; last_pos = pos; } - fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster + fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", pos+1); // a customized fprintf(pysam_stdout, ) would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { @@ -250,9 +252,9 @@ int main_depth(int argc, char *argv[]) if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } - printf("\t%d", n_plp[i] - m); // this the depth to output + fprintf(pysam_stdout, "\t%d", n_plp[i] - m); // this the depth to output } - putchar('\n'); + fputc('\n', pysam_stdout); } if (ret < 0) status = EXIT_FAILURE; free(n_plp); free(plp); @@ -265,10 +267,10 @@ int main_depth(int argc, char *argv[]) if (last_pos >= end) break; if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; - fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); + fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1); for (i = 0; i < n; i++) - putchar('\t'), putchar('0'); - putchar('\n'); + fputc('\t', pysam_stdout), fputc('0', pysam_stdout); + fputc('\n', pysam_stdout); } last_tid++; last_pos = -1; @@ -296,7 +298,7 @@ depth_end: } #ifdef _MAIN_BAM2DEPTH -int main(int argc, char *argv[]) +int samtools_bam2depth_main(int argc, char *argv[]) { return main_depth(argc, argv); } diff --git a/samtools/bam_addrprg.c b/samtools/bam_addrprg.c index 2b4939f..f7bbfab 100644 --- a/samtools/bam_addrprg.c +++ b/samtools/bam_addrprg.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "samtools.h" diff --git a/samtools/bam_addrprg.c.pysam.c b/samtools/bam_addrprg.c.pysam.c index 91fa9cd..2ddd1b1 100644 --- a/samtools/bam_addrprg.c.pysam.c +++ b/samtools/bam_addrprg.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "samtools.h" @@ -95,7 +97,7 @@ static char* basic_unescape(const char* in) if (*in == '\\') { ++in; if (*in == '\0') { - fprintf(pysamerr, "[%s] Unterminated escape sequence.\n", __func__); + fprintf(pysam_stderr, "[%s] Unterminated escape sequence.\n", __func__); free(out); return NULL; } @@ -107,11 +109,11 @@ static char* basic_unescape(const char* in) *ptr = '\t'; break; case 'n': - fprintf(pysamerr, "[%s] \\n in escape sequence is not supported.\n", __func__); + fprintf(pysam_stderr, "[%s] \\n in escape sequence is not supported.\n", __func__); free(out); return NULL; default: - fprintf(pysamerr, "[%s] Unsupported escape sequence.\n", __func__); + fprintf(pysam_stderr, "[%s] Unsupported escape sequence.\n", __func__); free(out); return NULL; } @@ -226,7 +228,7 @@ static void usage(FILE *fp) "\n" "Options:\n" " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n" - " -o FILE Where to write output to [stdout]\n" + " -o FILE Where to write output to [pysam_stdout]\n" " -r STRING @RG line text\n" " -R STRING ID of @RG line in existing header to use\n" ); @@ -238,11 +240,11 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) *opts = NULL; int n; - if (argc == 1) { usage(stdout); return true; } + if (argc == 1) { usage(pysam_stdout); return true; } parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t)); if (! 
retval ) { - fprintf(pysamerr, "[%s] Out of memory allocating parsed_opts_t\n", __func__); + fprintf(pysam_stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__); return false; } // Set defaults @@ -276,7 +278,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) } else if (strcmp(optarg, "orphan_only") == 0) { retval->mode = orphan_only; } else { - usage(pysamerr); + usage(pysam_stderr); return false; } break; @@ -285,17 +287,17 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) retval->output_name = strdup(optarg); break; case 'h': - usage(stdout); + usage(pysam_stdout); free(retval); return true; case '?': - usage(pysamerr); + usage(pysam_stderr); free(retval); return false; case 'O': default: if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break; - usage(pysamerr); + usage(pysam_stderr); free(retval); return false; } @@ -303,13 +305,13 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) retval->rg_line = ks_release(&rg_line); if (argc-optind < 1) { - fprintf(pysamerr, "You must specify an input file.\n"); - usage(pysamerr); + fprintf(pysam_stderr, "You must specify an input file.\n"); + usage(pysam_stderr); cleanup_opts(retval); return false; } if (retval->rg_id && retval->rg_line) { - fprintf(pysamerr, "The options -r and -R are mutually exclusive.\n"); + fprintf(pysam_stderr, "The options -r and -R are mutually exclusive.\n"); cleanup_opts(retval); return false; } @@ -319,7 +321,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) char* tmp = basic_unescape(retval->rg_line); if ((retval->rg_id = get_rg_id(tmp)) == NULL) { - fprintf(pysamerr, "[%s] The supplied RG line lacks an ID tag.\n", __func__); + fprintf(pysam_stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__); free(tmp); cleanup_opts(retval); return false; @@ -361,7 +363,7 @@ static void orphan_only_func(const state_t* state, bam1_t* file_read) static bool init(const parsed_opts_t* 
opts, state_t** state_out) { state_t* retval = (state_t*) calloc(1, sizeof(state_t)); if (retval == NULL) { - fprintf(pysamerr, "[init] Out of memory allocating state struct.\n"); + fprintf(pysam_stderr, "[init] Out of memory allocating state struct.\n"); return false; } *state_out = retval; @@ -369,7 +371,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { // Open files retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in); if (retval->input_file == NULL) { - fprintf(pysamerr, "[init] Could not open input file: %s\n", opts->input_name); + fprintf(pysam_stderr, "[init] Could not open input file: %s\n", opts->input_name); return false; } retval->input_header = sam_hdr_read(retval->input_file); @@ -386,14 +388,14 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { // Append new RG line to header. // Check does not already exist if ( confirm_rg(retval->output_header, opts->rg_id) ) { - fprintf(pysamerr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n"); + fprintf(pysam_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n"); return false; } retval->rg_id = strdup(opts->rg_id); size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2; char* new_header = malloc(new_len); if (!new_header) { - fprintf(pysamerr, "[init] Out of memory whilst writing new header.\n"); + fprintf(pysam_stderr, "[init] Out of memory whilst writing new header.\n"); return false; } sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line); @@ -404,13 +406,13 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { if (opts->rg_id) { // Confirm what has been supplied exists if ( !confirm_rg(retval->output_header, opts->rg_id) ) { - fprintf(pysamerr, "RG ID supplied does not exist in header. 
Supply full @RG line with -r instead?\n"); + fprintf(pysam_stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n"); return false; } retval->rg_id = strdup(opts->rg_id); } else { if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) { - fprintf(pysamerr, "No RG specified on command line or in existing header.\n"); + fprintf(pysam_stderr, "No RG specified on command line or in existing header.\n"); return false; } } diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c index 7a67de8..d90b4a8 100644 --- a/samtools/bam_aux.c +++ b/samtools/bam_aux.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "bam.h" diff --git a/samtools/bam_aux.c.pysam.c b/samtools/bam_aux.c.pysam.c index 475c772..c6bd0aa 100644 --- a/samtools/bam_aux.c.pysam.c +++ b/samtools/bam_aux.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "bam.h" diff --git a/samtools/bam_cat.c b/samtools/bam_cat.c index 83cc0fb..5c303d1 100644 --- a/samtools/bam_cat.c +++ b/samtools/bam_cat.c @@ -1,6 +1,6 @@ /* bam_cat.c -- efficiently concatenates bam files. - Copyright (C) 2008-2009, 2011-2013 Genome Research Ltd. + Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd. Modified SAMtools work copyright (C) 2010 Illumina, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy @@ -34,6 +34,8 @@ and modified to perform concatenation by Chris Saunders on behalf of Illumina. */ +#include + #include #include #include @@ -43,6 +45,7 @@ Illumina. 
#include "htslib/sam.h" #include "htslib/cram.h" #include "htslib/khash.h" +#include "samtools.h" KHASH_MAP_INIT_STR(s2i, int) @@ -195,7 +198,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t in = sam_open(fn[i], "rc"); if (in == 0) { - fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + print_error_errno("cat", "fail to open file '%s'", fn[i]); return NULL; } in_c = in->fp.cram; @@ -302,15 +305,18 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) sprintf(vers, "%d.%d", vers_maj, vers_min); out = sam_open(outcram, "wc"); if (out == 0) { - fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram); - return 1; + print_error_errno("cat", "fail to open output file '%s'", outcram); + return -1; } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); //fprintf(stderr, "Creating cram vers %s\n", vers); cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? - sam_hdr_write(out, new_h); + if (sam_hdr_write(out, new_h) < 0) { + print_error_errno("cat", "Couldn't write header"); + return -1; + } for (i = 0; i < nfn; ++i) { samFile *in; @@ -321,7 +327,7 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) in = sam_open(fn[i], "rc"); if (in == 0) { - fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + print_error_errno("cat", "fail to open file '%s'", fn[i]); return -1; } in_c = in->fp.cram; @@ -414,29 +420,37 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) { - BGZF *fp; - uint8_t *buf; + BGZF *fp, *in = NULL; + uint8_t *buf = NULL; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; fp = strcmp(outbam, "-")? 
bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); if (fp == 0) { - fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); - return 1; + print_error_errno("cat", "fail to open output file '%s'", outbam); + return -1; + } + if (h) { + if (bam_hdr_write(fp, h) < 0) { + print_error_errno("cat", "Couldn't write header"); + goto fail; + } } - if (h) bam_hdr_write(fp, h); buf = (uint8_t*) malloc(BUF_SIZE); + if (!buf) { + fprintf(stderr, "[%s] Couldn't allocate buffer\n", __func__); + goto fail; + } for(i = 0; i < nfn; ++i){ - BGZF *in; bam_hdr_t *old; int len,j; in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); if (in == 0) { - fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); - return -1; + print_error_errno("cat", "fail to open file '%s'", fn[i]); + goto fail; } if (in->is_write) return -1; @@ -444,14 +458,18 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) if (old == NULL) { fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n", __func__, fn[i]); - bgzf_close(in); - return -1; + goto fail; + } + if (h == 0 && i == 0) { + if (bam_hdr_write(fp, old) < 0) { + print_error_errno("cat", "Couldn't write header"); + goto fail; + } } - if (h == 0 && i == 0) bam_hdr_write(fp, old); if (in->block_offset < in->block_length) { - bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); - bgzf_flush(fp); + if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_flush(fp) != 0) goto write_fail; } j=0; @@ -460,16 +478,19 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) int diff=es-len; if(j==0) { fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]); - return -1; + goto fail; } - bgzf_raw_write(fp, ebuf, len); + if (bgzf_raw_write(fp, ebuf, len) < 0) goto write_fail; + 
memcpy(ebuf,ebuf+len,diff); memcpy(ebuf+diff,buf,len); } else { - if(j!=0) bgzf_raw_write(fp, ebuf, es); + if(j!=0) { + if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; + } len-= es; memcpy(ebuf,buf+len,es); - bgzf_raw_write(fp, buf, len); + if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail; } j=1; } @@ -482,15 +503,27 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) { fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]); fprintf(stderr, " Possible output corruption.\n"); - bgzf_raw_write(fp, ebuf, es); + if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; } } bam_hdr_destroy(old); bgzf_close(in); + in = NULL; } free(buf); - bgzf_close(fp); + if (bgzf_close(fp) < 0) { + fprintf(stderr, "[%s] Error on closing '%s'.\n", __func__, outbam); + return -1; + } return 0; + + write_fail: + fprintf(stderr, "[%s] Error writing to '%s'.\n", __func__, outbam); + fail: + if (in) bgzf_close(in); + if (fp) bgzf_close(fp); + free(buf); + return -1; } @@ -498,7 +531,7 @@ int main_cat(int argc, char *argv[]) { bam_hdr_t *h = 0; char *outfn = 0; - int c, ret; + int c, ret = 0; samFile *in; while ((c = getopt(argc, argv, "h:o:")) >= 0) { @@ -529,19 +562,21 @@ int main_cat(int argc, char *argv[]) in = sam_open(argv[optind], "r"); if (!in) { - fprintf(stderr, "[%s] ERROR: failed to open file '%s'.\n", __func__, argv[optind]); + print_error_errno("cat", "failed to open file '%s'", argv[optind]); return 1; } switch (hts_get_format(in)->format) { case bam: sam_close(in); - ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); + if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) + ret = 1; break; case cram: sam_close(in); - ret = cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); + if (cram_cat(argc - optind, argv + optind, h, outfn? 
outfn : "-") < 0) + ret = 1; break; default: diff --git a/samtools/bam_cat.c.pysam.c b/samtools/bam_cat.c.pysam.c index 004911a..daa0454 100644 --- a/samtools/bam_cat.c.pysam.c +++ b/samtools/bam_cat.c.pysam.c @@ -2,7 +2,7 @@ /* bam_cat.c -- efficiently concatenates bam files. - Copyright (C) 2008-2009, 2011-2013 Genome Research Ltd. + Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd. Modified SAMtools work copyright (C) 2010 Illumina, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy @@ -36,6 +36,8 @@ and modified to perform concatenation by Chris Saunders on behalf of Illumina. */ +#include + #include #include #include @@ -45,6 +47,7 @@ Illumina. #include "htslib/sam.h" #include "htslib/cram.h" #include "htslib/khash.h" +#include "samtools.h" KHASH_MAP_INIT_STR(s2i, int) @@ -197,7 +200,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t in = sam_open(fn[i], "rc"); if (in == 0) { - fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + print_error_errno("cat", "fail to open file '%s'", fn[i]); return NULL; } in_c = in->fp.cram; @@ -206,7 +209,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t int vmin = cram_minor_vers(in_c); if ((vers_maj != -1 && vers_maj != vmaj) || (vers_min != -1 && vers_min != vmin)) { - fprintf(pysamerr, "[%s] ERROR: input files have differing version numbers.\n", + fprintf(pysam_stderr, "[%s] ERROR: input files have differing version numbers.\n", __func__); return NULL; } @@ -226,7 +229,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t int added; new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added); - //fprintf(pysamerr, "RG %s: #%d -> #%d\n", + //fprintf(pysam_stderr, "RG %s: #%d -> #%d\n", // rg2id_in->id[ki], ki, new_rg); if (added) { @@ -242,7 +245,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t } if 
(new_rg != ki && rg2id_in->n_id > 1) { - fprintf(pysamerr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", + fprintf(pysam_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", __func__); return NULL; } @@ -304,15 +307,18 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) sprintf(vers, "%d.%d", vers_maj, vers_min); out = sam_open(outcram, "wc"); if (out == 0) { - fprintf(pysamerr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram); - return 1; + print_error_errno("cat", "fail to open output file '%s'", outcram); + return -1; } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); - //fprintf(pysamerr, "Creating cram vers %s\n", vers); + //fprintf(pysam_stderr, "Creating cram vers %s\n", vers); cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? - sam_hdr_write(out, new_h); + if (sam_hdr_write(out, new_h) < 0) { + print_error_errno("cat", "Couldn't write header"); + return -1; + } for (i = 0; i < nfn; ++i) { samFile *in; @@ -323,7 +329,7 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) in = sam_open(fn[i], "rc"); if (in == 0) { - fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + print_error_errno("cat", "fail to open file '%s'", fn[i]); return -1; } in_c = in->fp.cram; @@ -367,7 +373,7 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) // we need to edit the compression header. IF WE CAN. 
if (new_rg) { int zero = 0; - //fprintf(pysamerr, "Transcode RG %d to %d\n", 0, new_rg); + //fprintf(pysam_stderr, "Transcode RG %d to %d\n", 0, new_rg); cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; @@ -416,44 +422,56 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) { - BGZF *fp; - uint8_t *buf; + BGZF *fp, *in = NULL; + uint8_t *buf = NULL; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; - fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); + fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(pysam_stdout), "w"); if (fp == 0) { - fprintf(pysamerr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); - return 1; + print_error_errno("cat", "fail to open output file '%s'", outbam); + return -1; + } + if (h) { + if (bam_hdr_write(fp, h) < 0) { + print_error_errno("cat", "Couldn't write header"); + goto fail; + } } - if (h) bam_hdr_write(fp, h); buf = (uint8_t*) malloc(BUF_SIZE); + if (!buf) { + fprintf(pysam_stderr, "[%s] Couldn't allocate buffer\n", __func__); + goto fail; + } for(i = 0; i < nfn; ++i){ - BGZF *in; bam_hdr_t *old; int len,j; in = strcmp(fn[i], "-")? 
bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); if (in == 0) { - fprintf(pysamerr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); - return -1; + print_error_errno("cat", "fail to open file '%s'", fn[i]); + goto fail; } if (in->is_write) return -1; old = bam_hdr_read(in); if (old == NULL) { - fprintf(pysamerr, "[%s] ERROR: couldn't read header for '%s'.\n", + fprintf(pysam_stderr, "[%s] ERROR: couldn't read header for '%s'.\n", __func__, fn[i]); - bgzf_close(in); - return -1; + goto fail; + } + if (h == 0 && i == 0) { + if (bam_hdr_write(fp, old) < 0) { + print_error_errno("cat", "Couldn't write header"); + goto fail; + } } - if (h == 0 && i == 0) bam_hdr_write(fp, old); if (in->block_offset < in->block_length) { - bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); - bgzf_flush(fp); + if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_flush(fp) != 0) goto write_fail; } j=0; @@ -461,17 +479,20 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) if(len= 0) { @@ -508,12 +541,12 @@ int main_cat(int argc, char *argv[]) case 'h': { samFile *fph = sam_open(optarg, "r"); if (fph == 0) { - fprintf(pysamerr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); + fprintf(pysam_stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); return 1; } h = sam_hdr_read(fph); if (h == NULL) { - fprintf(pysamerr, + fprintf(pysam_stderr, "[%s] ERROR: failed to read the header for '%s'.\n", __func__, argv[1]); return 1; @@ -525,30 +558,32 @@ int main_cat(int argc, char *argv[]) } } if (argc - optind < 1) { - fprintf(pysamerr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); + fprintf(pysam_stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); return 1; } in = sam_open(argv[optind], "r"); if (!in) { - fprintf(pysamerr, "[%s] ERROR: failed to open 
file '%s'.\n", __func__, argv[optind]); + print_error_errno("cat", "failed to open file '%s'", argv[optind]); return 1; } switch (hts_get_format(in)->format) { case bam: sam_close(in); - ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); + if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) + ret = 1; break; case cram: sam_close(in); - ret = cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); + if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) + ret = 1; break; default: sam_close(in); - fprintf(pysamerr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); + fprintf(pysam_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); return 1; } free(outfn); diff --git a/samtools/bam_color.c b/samtools/bam_color.c index 3983c44..bee19b9 100644 --- a/samtools/bam_color.c +++ b/samtools/bam_color.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include "bam.h" diff --git a/samtools/bam_color.c.pysam.c b/samtools/bam_color.c.pysam.c index 78d8510..6bd12c4 100644 --- a/samtools/bam_color.c.pysam.c +++ b/samtools/bam_color.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include "bam.h" diff --git a/samtools/bam_flags.c b/samtools/bam_flags.c index ddc7b11..11a82b6 100644 --- a/samtools/bam_flags.c +++ b/samtools/bam_flags.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include diff --git a/samtools/bam_flags.c.pysam.c b/samtools/bam_flags.c.pysam.c index f4df057..4895f9a 100644 --- a/samtools/bam_flags.c.pysam.c +++ b/samtools/bam_flags.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -35,24 +37,24 @@ DEALINGS IN THE SOFTWARE. */ static void usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Convert between textual and numeric flag representation\n"); - fprintf(pysamerr, "Usage: samtools flags INT|STR[,...]\n"); - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Flags:\n"); - fprintf(pysamerr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); - fprintf(pysamerr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR); - fprintf(pysamerr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); - fprintf(pysamerr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); - fprintf(pysamerr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE); - fprintf(pysamerr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); - fprintf(pysamerr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); - fprintf(pysamerr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); - fprintf(pysamerr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY); - fprintf(pysamerr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL); - fprintf(pysamerr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); - fprintf(pysamerr, "\t0x%x\tSUPPLEMENTARY .. 
supplementary alignment\n", BAM_FSUPPLEMENTARY); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Convert between textual and numeric flag representation\n"); + fprintf(pysam_stderr, "Usage: samtools flags INT|STR[,...]\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Flags:\n"); + fprintf(pysam_stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); + fprintf(pysam_stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR); + fprintf(pysam_stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); + fprintf(pysam_stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); + fprintf(pysam_stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE); + fprintf(pysam_stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); + fprintf(pysam_stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); + fprintf(pysam_stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); + fprintf(pysam_stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY); + fprintf(pysam_stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL); + fprintf(pysam_stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); + fprintf(pysam_stderr, "\t0x%x\tSUPPLEMENTARY .. 
supplementary alignment\n", BAM_FSUPPLEMENTARY); + fprintf(pysam_stderr, "\n"); } @@ -62,8 +64,8 @@ int main_flags(int argc, char *argv[]) else { int mask = bam_str2flag(argv[1]); - if ( mask<0 ) { fprintf(pysamerr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; } - printf("0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask)); + if ( mask<0 ) { fprintf(pysam_stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; } + fprintf(pysam_stdout, "0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask)); } return 0; } diff --git a/samtools/bam_import.c b/samtools/bam_import.c index d959d0e..96f8158 100644 --- a/samtools/bam_import.c +++ b/samtools/bam_import.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c index c2854f4..3b5dd4a 100644 --- a/samtools/bam_import.c.pysam.c +++ b/samtools/bam_import.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -60,6 +62,6 @@ bam_header_t *sam_header_read2(const char *fn) free(str->s); free(str); header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); free(samstr.s); - fprintf(pysamerr, "[sam_header_read2] %d sequences loaded.\n", n_targets); + fprintf(pysam_stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets); return header; } diff --git a/samtools/bam_index.c b/samtools/bam_index.c index 83a855d..3a5acf6 100644 --- a/samtools/bam_index.c +++ b/samtools/bam_index.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c index ed902c5..6c0efdc 100644 --- a/samtools/bam_index.c.pysam.c +++ b/samtools/bam_index.c.pysam.c @@ -26,6 +26,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -61,12 +63,12 @@ int bam_index(int argc, char *argv[]) case 'c': csi = 1; break; case 'm': csi = 1; min_shift = atoi(optarg); break; default: - index_usage(pysamerr); + index_usage(pysam_stderr); return 1; } if (optind == argc) { - index_usage(stdout); + index_usage(pysam_stdout); return 1; } @@ -91,31 +93,31 @@ int bam_idxstats(int argc, char *argv[]) samFile* fp; if (argc < 2) { - fprintf(pysamerr, "Usage: samtools idxstats \n"); + fprintf(pysam_stderr, "Usage: samtools idxstats \n"); return 1; } fp = sam_open(argv[1], "r"); - if (fp == NULL) { fprintf(pysamerr, "[%s] fail to open BAM.\n", __func__); return 1; } + if (fp == NULL) { fprintf(pysam_stderr, "[%s] fail to open BAM.\n", __func__); return 1; } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(pysamerr, "[%s] failed to read header for '%s'.\n", + fprintf(pysam_stderr, "[%s] failed to read header for '%s'.\n", __func__, argv[1]); return 1; } idx = sam_index_load(fp, argv[1]); - if (idx == NULL) { fprintf(pysamerr, "[%s] fail to load the index.\n", __func__); return 1; } + if (idx == NULL) { fprintf(pysam_stderr, "[%s] fail to load the index.\n", __func__); return 1; } int i; for (i = 0; i < header->n_targets; ++i) { // Print out contig name and length - printf("%s\t%d", header->target_name[i], header->target_len[i]); + fprintf(pysam_stdout, "%s\t%d", header->target_name[i], header->target_len[i]); // Now fetch info about it from the meta bin uint64_t u, v; hts_idx_get_stat(idx, i, &u, &v); - printf("\t%" PRIu64 "\t%" PRIu64 "\n", u, v); + fprintf(pysam_stdout, 
"\t%" PRIu64 "\t%" PRIu64 "\n", u, v); } // Dump information about unmapped reads - printf("*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx)); + fprintf(pysam_stdout, "*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx)); bam_hdr_destroy(header); hts_idx_destroy(idx); sam_close(fp); diff --git a/samtools/bam_lpileup.c b/samtools/bam_lpileup.c index 0cee701..e20cc92 100644 --- a/samtools/bam_lpileup.c +++ b/samtools/bam_lpileup.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/bam_lpileup.c.pysam.c b/samtools/bam_lpileup.c.pysam.c index bdf4348..9f7f063 100644 --- a/samtools/bam_lpileup.c.pysam.c +++ b/samtools/bam_lpileup.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -179,14 +181,14 @@ static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl } tv->n_pre = l; /* - fprintf(pysamerr, "%d\t", pos+1); + fprintf(pysam_stderr, "%d\t", pos+1); for (i = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; - if (p->is_head) fprintf(pysamerr, "^"); - if (p->is_tail) fprintf(pysamerr, "$"); - fprintf(pysamerr, "%d,", p->level); + if (p->is_head) fprintf(pysam_stderr, "^"); + if (p->is_tail) fprintf(pysam_stderr, "$"); + fprintf(pysam_stderr, "%d,", p->level); } - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); */ return 0; } diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c index 54c3ed3..5b13b2e 100644 --- a/samtools/bam_mate.c +++ b/samtools/bam_mate.c @@ -1,6 +1,6 @@ /* bam_mate.c -- fix mate pairing information and clean up flags. - Copyright (C) 2009, 2011-2014 Genome Research Ltd. + Copyright (C) 2009, 2011-2016 Genome Research Ltd. 
Portions copyright (C) 2011 Broad Institute. Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -32,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */ #include "sam_opts.h" #include "htslib/kstring.h" #include "htslib/sam.h" +#include "samtools.h" /* * This function calculates ct tag for two bams, it assumes they are from the same template and @@ -177,10 +180,10 @@ static void sync_mate(bam1_t* a, bam1_t* b) } // currently, this function ONLY works if each read has one hit -static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) +static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) { bam_hdr_t *header; - bam1_t *b[2]; + bam1_t *b[2] = { NULL, NULL }; int curr, has_prev, pre_end = 0, cur_end = 0; kstring_t str; @@ -188,7 +191,7 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro header = sam_hdr_read(in); if (header == NULL) { fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); - exit(1); + return 1; } // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { @@ -199,10 +202,10 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro // (e.g. 
must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); - exit(1); + goto fail; } } - sam_hdr_write(out, header); + if (sam_hdr_write(out, header) < 0) goto write_fail; b[0] = bam_init1(); b[1] = bam_init1(); @@ -211,12 +214,14 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.flag & BAM_FSECONDARY) { - if ( !remove_reads ) sam_write1(out, header, cur); + if ( !remove_reads ) { + if (sam_write1(out, header, cur) < 0) goto write_fail; + } continue; // skip secondary alignments } if (cur->core.flag & BAM_FSUPPLEMENTARY) { - sam_write1(out, header, cur); + if (sam_write1(out, header, cur) < 0) goto write_fail; continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag @@ -253,14 +258,18 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro // Write out result if ( !remove_reads ) { - sam_write1(out, header, pre); - sam_write1(out, header, cur); + if (sam_write1(out, header, pre) < 0) goto write_fail; + if (sam_write1(out, header, cur) < 0) goto write_fail; } else { // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); - if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre); - if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur); + if(!(pre->core.flag&BAM_FUNMAP)) { + if (sam_write1(out, header, pre) < 0) goto write_fail; + } + if(!(cur->core.flag&BAM_FUNMAP)) { + if (sam_write1(out, header, cur) < 0) goto write_fail; + } } has_prev = 0; } else { // unpaired? 
clear bad info and write it out @@ -271,7 +280,9 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); - if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre); + if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) { + if (sam_write1(out, header, pre) < 0) goto write_fail; + } } } else has_prev = 1; curr = 1 - curr; @@ -287,12 +298,21 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); - sam_write1(out, header, pre); + if (sam_write1(out, header, pre) < 0) goto write_fail; } bam_hdr_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); + return 0; + + write_fail: + print_error_errno("fixmate", "Couldn't write to output file"); + fail: + bam_hdr_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); + return 1; } void usage(FILE* where) @@ -315,8 +335,8 @@ void usage(FILE* where) int bam_mating(int argc, char *argv[]) { - samFile *in, *out; - int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0; + samFile *in = NULL, *out = NULL; + int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[3] = {'w', 'b', 0}; static const struct option lopts[] = { @@ -333,30 +353,40 @@ int bam_mating(int argc, char *argv[]) case 'c': add_ct = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': usage(stderr); return 1; + case '?': usage(stderr); goto fail; } } - if (optind+1 >= argc) { usage(stderr); return 1; } + if (optind+1 >= argc) { usage(stderr); goto fail; } // init if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { - fprintf(stderr, "[bam_mating] cannot open input 
file\n"); - return 1; + print_error_errno("fixmate", "cannot open input file"); + goto fail; } sam_open_mode(wmode+1, argv[optind+1], NULL); if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) { - fprintf(stderr, "[bam_mating] cannot open output file\n"); - return 1; + print_error_errno("fixmate", "cannot open output file"); + goto fail; } // run - bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); + res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); // cleanup - sam_close(in); sam_close(out); + sam_close(in); + if (sam_close(out) < 0) { + fprintf(stderr, "[bam_mating] error while closing output file\n"); + res = 1; + } + sam_global_args_free(&ga); + return res; - return 0; + fail: + if (in) sam_close(in); + if (out) sam_close(out); + sam_global_args_free(&ga); + return 1; } diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index c7900a1..a416d07 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -2,7 +2,7 @@ /* bam_mate.c -- fix mate pairing information and clean up flags. - Copyright (C) 2009, 2011-2014 Genome Research Ltd. + Copyright (C) 2009, 2011-2016 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. @@ -26,6 +26,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -34,6 +36,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "sam_opts.h" #include "htslib/kstring.h" #include "htslib/sam.h" +#include "samtools.h" /* * This function calculates ct tag for two bams, it assumes they are from the same template and @@ -179,18 +182,18 @@ static void sync_mate(bam1_t* a, bam1_t* b) } // currently, this function ONLY works if each read has one hit -static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) +static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) { bam_hdr_t *header; - bam1_t *b[2]; + bam1_t *b[2] = { NULL, NULL }; int curr, has_prev, pre_end = 0, cur_end = 0; kstring_t str; str.l = str.m = 0; str.s = 0; header = sam_hdr_read(in); if (header == NULL) { - fprintf(pysamerr, "[bam_mating_core] ERROR: Couldn't read header\n"); - exit(1); + fprintf(pysam_stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); + return 1; } // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { @@ -200,11 +203,11 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro // Looking for SO:coordinate within the @HD line only // (e.g. 
must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { - fprintf(pysamerr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); - exit(1); + fprintf(pysam_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); + goto fail; } } - sam_hdr_write(out, header); + if (sam_hdr_write(out, header) < 0) goto write_fail; b[0] = bam_init1(); b[1] = bam_init1(); @@ -213,12 +216,14 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.flag & BAM_FSECONDARY) { - if ( !remove_reads ) sam_write1(out, header, cur); + if ( !remove_reads ) { + if (sam_write1(out, header, cur) < 0) goto write_fail; + } continue; // skip secondary alignments } if (cur->core.flag & BAM_FSUPPLEMENTARY) { - sam_write1(out, header, cur); + if (sam_write1(out, header, cur) < 0) goto write_fail; continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag @@ -255,14 +260,18 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro // Write out result if ( !remove_reads ) { - sam_write1(out, header, pre); - sam_write1(out, header, cur); + if (sam_write1(out, header, pre) < 0) goto write_fail; + if (sam_write1(out, header, cur) < 0) goto write_fail; } else { // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); - if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre); - if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur); + if(!(pre->core.flag&BAM_FUNMAP)) { + if (sam_write1(out, header, pre) < 0) goto write_fail; + } + 
if(!(cur->core.flag&BAM_FUNMAP)) { + if (sam_write1(out, header, cur) < 0) goto write_fail; + } } has_prev = 0; } else { // unpaired? clear bad info and write it out @@ -273,7 +282,9 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); - if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre); + if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) { + if (sam_write1(out, header, pre) < 0) goto write_fail; + } } } else has_prev = 1; curr = 1 - curr; @@ -289,12 +300,21 @@ static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int pro pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); - sam_write1(out, header, pre); + if (sam_write1(out, header, pre) < 0) goto write_fail; } bam_hdr_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); + return 0; + + write_fail: + print_error_errno("fixmate", "Couldn't write to output file"); + fail: + bam_hdr_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); + return 1; } void usage(FILE* where) @@ -310,15 +330,15 @@ void usage(FILE* where) fprintf(where, "\n" -"As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n" +"As elsewhere in samtools, use '-' as the filename for stdin/pysam_stdout. The input\n" "file must be grouped by read name (e.g. sorted by name). 
Coordinated sorted\n" "input is not accepted.\n"); } int bam_mating(int argc, char *argv[]) { - samFile *in, *out; - int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0; + samFile *in = NULL, *out = NULL; + int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[3] = {'w', 'b', 0}; static const struct option lopts[] = { @@ -327,7 +347,7 @@ int bam_mating(int argc, char *argv[]) }; // parse args - if (argc == 1) { usage(stdout); return 0; } + if (argc == 1) { usage(pysam_stdout); return 0; } while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; @@ -335,30 +355,40 @@ int bam_mating(int argc, char *argv[]) case 'c': add_ct = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': usage(pysamerr); return 1; + case '?': usage(pysam_stderr); goto fail; } } - if (optind+1 >= argc) { usage(pysamerr); return 1; } + if (optind+1 >= argc) { usage(pysam_stderr); goto fail; } // init if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { - fprintf(pysamerr, "[bam_mating] cannot open input file\n"); - return 1; + print_error_errno("fixmate", "cannot open input file"); + goto fail; } sam_open_mode(wmode+1, argv[optind+1], NULL); if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) { - fprintf(pysamerr, "[bam_mating] cannot open output file\n"); - return 1; + print_error_errno("fixmate", "cannot open output file"); + goto fail; } // run - bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); + res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); // cleanup - sam_close(in); sam_close(out); + sam_close(in); + if (sam_close(out) < 0) { + fprintf(pysam_stderr, "[bam_mating] error while closing output file\n"); + res = 1; + } + sam_global_args_free(&ga); + return res; - return 0; + fail: + if (in) sam_close(in); + if (out) 
sam_close(out); + sam_global_args_free(&ga); + return 1; } diff --git a/samtools/bam_md.c b/samtools/bam_md.c index 30f3243..71206cd 100644 --- a/samtools/bam_md.c +++ b/samtools/bam_md.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -33,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "kprobaln.h" #include "sam_opts.h" +#include "samtools.h" #define USE_EQUAL 1 #define DROP_TAG 2 @@ -349,11 +352,11 @@ int calmd_usage() { int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; - samFile *fp, *fpout = 0; - bam_hdr_t *header; - faidx_t *fai; - char *ref = 0, mode_w[8], *ref_file; - bam1_t *b; + samFile *fp = NULL, *fpout = NULL; + bam_hdr_t *header = NULL; + faidx_t *fai = NULL; + char *ref = NULL, mode_w[8], *ref_file; + bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -391,35 +394,51 @@ int bam_fillmd(int argc, char *argv[]) if (optind + (ga.reference == NULL) >= argc) return calmd_usage(); fp = sam_open_format(argv[optind], "r", &ga.in); - if (fp == 0) return 1; + if (fp == NULL) { + print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]); + return 1; + } header = sam_hdr_read(fp); if (header == NULL || header->n_targets == 0) { fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); - return 1; + goto fail; } fpout = sam_open_format("-", mode_w, &ga.out); - sam_hdr_write(fpout, header); + if (fpout == NULL) { + print_error_errno("calmd", "Failed to open output"); + goto fail; + } + if (sam_hdr_write(fpout, header) < 0) { + print_error_errno("calmd", "Failed to write sam header"); + goto fail; + } ref_file = argc > optind + 1 ? 
argv[optind+1] : ga.reference; fai = fai_load(ref_file); if (!fai) { - perror(ref_file); - return 1; + print_error_errno("calmd", "Failed to open reference file '%s'", ref_file); + goto fail; } b = bam_init1(); + if (!b) { + fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n"); + goto fail; + } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); ref = fai_fetch(fai, header->target_name[b->core.tid], &len); tid = b->core.tid; - if (ref == 0) + if (ref == 0) { // FIXME: Should this always be fatal? fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]); + if (is_realn || capQ > 10) goto fail; // Would otherwise crash + } } if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag); if (capQ > 10) { @@ -428,7 +447,14 @@ int bam_fillmd(int argc, char *argv[]) } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); } - sam_write1(fpout, header, b); + if (sam_write1(fpout, header, b) < 0) { + print_error_errno("calmd", "failed to write to output file"); + goto fail; + } + } + if (ret < -1) { + fprintf(stderr, "[bam_fillmd] Error reading input.\n"); + goto fail; } bam_destroy1(b); bam_hdr_destroy(header); @@ -436,6 +462,18 @@ int bam_fillmd(int argc, char *argv[]) free(ref); fai_destroy(fai); sam_close(fp); - sam_close(fpout); + if (sam_close(fpout) < 0) { + fprintf(stderr, "[bam_fillmd] error when closing output file\n"); + return 1; + } return 0; + + fail: + free(ref); + if (b) bam_destroy1(b); + if (header) bam_hdr_destroy(header); + if (fai) fai_destroy(fai); + if (fp) sam_close(fp); + if (fpout) sam_close(fpout); + return 1; } diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c index 070f9cd..d00c01d 100644 --- a/samtools/bam_md.c.pysam.c +++ b/samtools/bam_md.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
IN THE SOFTWARE. */ +#include + #include #include #include @@ -35,6 +37,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "kprobaln.h" #include "sam_opts.h" +#include "samtools.h" #define USE_EQUAL 1 #define DROP_TAG 2 @@ -115,7 +118,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm) if (old_nm) old_nm_i = bam_aux2i(old_nm); if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); else if (nm != old_nm_i) { - fprintf(pysamerr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm); + fprintf(pysam_stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm); bam_aux_del(b, old_nm); bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); } @@ -133,7 +136,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm) if (i < str->l) is_diff = 1; } else is_diff = 1; if (is_diff) { - fprintf(pysamerr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s); + fprintf(pysam_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s); bam_aux_del(b, old_md); bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } @@ -207,7 +210,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres) if (t > thres) return -1; if (t < 0) t = 0; t = sqrt((thres - t) / thres) * thres; -// fprintf(pysamerr, "%s %lf %d\n", bam_get_qname(b), t, q); +// fprintf(pysam_stderr, "%s %lf %d\n", bam_get_qname(b), t, q); return (int)(t + .499); } @@ -333,7 +336,7 @@ int bam_prob_realn(bam1_t *b, const char *ref) } int calmd_usage() { - fprintf(pysamerr, + fprintf(pysam_stderr, "Usage: samtools calmd [-eubrAES] \n" "Options:\n" " -e change identical bases to '='\n" @@ -344,18 +347,18 @@ int calmd_usage() { " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n" " -E extended BAQ for better sensitivity but lower specificity\n"); - 
sam_global_opt_help(pysamerr, "-...."); + sam_global_opt_help(pysam_stderr, "-...."); return 1; } int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; - samFile *fp, *fpout = 0; - bam_hdr_t *header; - faidx_t *fai; - char *ref = 0, mode_w[8], *ref_file; - bam1_t *b; + samFile *fp = NULL, *fpout = NULL; + bam_hdr_t *header = NULL; + faidx_t *fai = NULL; + char *ref = NULL, mode_w[8], *ref_file; + bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -382,7 +385,7 @@ int bam_fillmd(int argc, char *argv[]) case 'A': baq_flag |= 1; break; case 'E': baq_flag |= 2; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(pysamerr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); + fprintf(pysam_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); /* else fall-through */ case '?': return calmd_usage(); } @@ -393,35 +396,51 @@ int bam_fillmd(int argc, char *argv[]) if (optind + (ga.reference == NULL) >= argc) return calmd_usage(); fp = sam_open_format(argv[optind], "r", &ga.in); - if (fp == 0) return 1; + if (fp == NULL) { + print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]); + return 1; + } header = sam_hdr_read(fp); if (header == NULL || header->n_targets == 0) { - fprintf(pysamerr, "[bam_fillmd] input SAM does not have header. Abort!\n"); - return 1; + fprintf(pysam_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); + goto fail; + } + + fpout = sam_open_format(pysam_stdout_fn, mode_w, &ga.out); + if (fpout == NULL) { + print_error_errno("calmd", "Failed to open output"); + goto fail; + } + if (sam_hdr_write(fpout, header) < 0) { + print_error_errno("calmd", "Failed to write sam header"); + goto fail; } - - fpout = sam_open_format("-", mode_w, &ga.out); - sam_hdr_write(fpout, header); ref_file = argc > optind + 1 ? 
argv[optind+1] : ga.reference; fai = fai_load(ref_file); if (!fai) { - perror(ref_file); - return 1; + print_error_errno("calmd", "Failed to open reference file '%s'", ref_file); + goto fail; } b = bam_init1(); + if (!b) { + fprintf(pysam_stderr, "[bam_fillmd] Failed to allocate bam struct\n"); + goto fail; + } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); ref = fai_fetch(fai, header->target_name[b->core.tid], &len); tid = b->core.tid; - if (ref == 0) - fprintf(pysamerr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", + if (ref == 0) { // FIXME: Should this always be fatal? + fprintf(pysam_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]); + if (is_realn || capQ > 10) goto fail; // Would otherwise crash + } } if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag); if (capQ > 10) { @@ -430,7 +449,14 @@ int bam_fillmd(int argc, char *argv[]) } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); } - sam_write1(fpout, header, b); + if (sam_write1(fpout, header, b) < 0) { + print_error_errno("calmd", "failed to write to output file"); + goto fail; + } + } + if (ret < -1) { + fprintf(pysam_stderr, "[bam_fillmd] Error reading input.\n"); + goto fail; } bam_destroy1(b); bam_hdr_destroy(header); @@ -438,6 +464,18 @@ int bam_fillmd(int argc, char *argv[]) free(ref); fai_destroy(fai); sam_close(fp); - sam_close(fpout); + if (sam_close(fpout) < 0) { + fprintf(pysam_stderr, "[bam_fillmd] error when closing output file\n"); + return 1; + } return 0; + + fail: + free(ref); + if (b) bam_destroy1(b); + if (header) bam_hdr_destroy(header); + if (fai) fai_destroy(fai); + if (fp) sam_close(fp); + if (fpout) sam_close(fpout); + return 1; } diff --git a/samtools/bam_plbuf.c b/samtools/bam_plbuf.c index a579b77..12ea250 100644 --- a/samtools/bam_plbuf.c +++ b/samtools/bam_plbuf.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/bam_plbuf.c.pysam.c b/samtools/bam_plbuf.c.pysam.c index 5b8dda0..76c1ac1 100644 --- a/samtools/bam_plbuf.c.pysam.c +++ b/samtools/bam_plbuf.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index 9e00836..dc12bf3 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -785,7 +787,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -b, --bam-list FILE list of input BAM filenames, one per line\n" " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" " -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" -" -d, --max-depth INT max per-BAM depth; avoids excessive memory usage [%d]\n", mplp->max_depth); +" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); fprintf(fp, " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" " -f, --fasta-ref FILE faidx indexed reference sequence file\n" @@ -826,7 +828,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); fprintf(fp, " -I, --skip-indels do not perform indel calling\n" -" -L, --max-idepth INT maximum per-sample depth for INDEL calling [%d]\n", mplp->max_indel_depth); +" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); fprintf(fp, " -m, 
--min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); fprintf(fp, diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index bafbb92..650e818 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -248,7 +250,7 @@ static int mplp_func(void *data, bam1_t *b) if (ma->conf->fai && b->core.tid >= 0) { has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence - fprintf(pysamerr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", + fprintf(pysam_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", __func__, b->core.pos, ref_len, b->core.tid); skip = 1; continue; @@ -285,7 +287,7 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); if (id < 0 || id >= m->n) { assert(q); // otherwise a bug - fprintf(pysamerr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); + fprintf(pysam_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); exit(EXIT_FAILURE); } if (m->n_plp[id] == m->m_plp[id]) { @@ -336,7 +338,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) sm = bam_smpl_init(); if (n == 0) { - fprintf(pysamerr,"[%s] no input file/data given\n", __func__); + fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__); exit(EXIT_FAILURE); } @@ -347,15 +349,15 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in); if ( !data[i]->fp ) { - 
fprintf(pysamerr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); + fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(EXIT_FAILURE); } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) { - fprintf(pysamerr, "[%s] failed to process %s: %s\n", + fprintf(pysam_stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); exit(EXIT_FAILURE); } @@ -363,7 +365,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(data[i]->fp); if ( !h_tmp ) { - fprintf(pysamerr,"[%s] fail to read the header of %s\n", __func__, fn[i]); + fprintf(pysam_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); exit(EXIT_FAILURE); } bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); @@ -372,11 +374,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (conf->reg) { hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); if (idx == NULL) { - fprintf(pysamerr, "[%s] fail to load index for %s\n", __func__, fn[i]); + fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(EXIT_FAILURE); } if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) { - fprintf(pysamerr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); + fprintf(pysam_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); exit(EXIT_FAILURE); } if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; @@ -401,7 +403,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); - fprintf(pysamerr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); + fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_BCF) { @@ -413,7 +415,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); if (bcf_fp == NULL) { - fprintf(pysamerr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); + fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } @@ -529,10 +531,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } else { - pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout; + pileup_fp = conf->output_fname? 
fopen(conf->output_fname, "w") : pysam_stdout; if (pileup_fp == NULL) { - fprintf(pysamerr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); + fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); exit(EXIT_FAILURE); } } @@ -542,10 +544,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) - fprintf(pysamerr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); + fprintf(pysam_stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; - fprintf(pysamerr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); + fprintf(pysam_stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); @@ -639,7 +641,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if ( c < conf->min_baseQ ) continue; if (last++) putc(',', pileup_fp); - fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... + fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(pysam_stdout, ) is very slow... 
} } } @@ -695,7 +697,7 @@ int read_file_list(const char *file_list,int *n,char **argv[]) FILE *fh = fopen(file_list,"r"); if ( !fh ) { - fprintf(pysamerr,"%s: %s\n", file_list,strerror(errno)); + fprintf(pysam_stderr,"%s: %s\n", file_list,strerror(errno)); return 1; } @@ -717,9 +719,9 @@ int read_file_list(const char *file_list,int *n,char **argv[]) for (i=0; imax_depth); +" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); fprintf(fp, " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" " -f, --fasta-ref FILE faidx indexed reference sequence file\n" @@ -828,7 +830,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); fprintf(fp, " -I, --skip-indels do not perform indel calling\n" -" -L, --max-idepth INT maximum per-sample depth for INDEL calling [%d]\n", mplp->max_indel_depth); +" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); fprintf(fp, " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); fprintf(fp, @@ -921,11 +923,11 @@ int bam_mpileup(int argc, char *argv[]) case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : mplp.rflag_require = bam_str2flag(optarg); - if ( mplp.rflag_require<0 ) { fprintf(pysamerr,"Could not parse --rf %s\n", optarg); return 1; } + if ( mplp.rflag_require<0 ) { fprintf(pysam_stderr,"Could not parse --rf %s\n", optarg); return 1; } break; case 2 : mplp.rflag_filter = bam_str2flag(optarg); - if ( mplp.rflag_filter<0 ) { fprintf(pysamerr,"Could not parse --ff %s\n", optarg); return 1; } + if ( mplp.rflag_filter<0 ) { fprintf(pysam_stderr,"Could not parse --ff %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; @@ -949,9 +951,9 @@ int bam_mpileup(int argc, char *argv[]) case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; break; 
case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; break; case 'B': mplp.flag &= ~MPLP_REALN; break; - case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(pysamerr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break; - case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(pysamerr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break; - case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(pysamerr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break; + case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(pysam_stderr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break; + case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(pysam_stderr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break; + case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(pysam_stderr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; @@ -981,7 +983,7 @@ int bam_mpileup(int argc, char *argv[]) char buf[1024]; mplp.rghash = khash_str2int_init(); if ((fp_rg = fopen(optarg, "r")) == NULL) - fprintf(pysamerr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); + fprintf(pysam_stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... 
khash_str2int_inc(mplp.rghash, strdup(buf)); fclose(fp_rg); @@ -992,7 +994,7 @@ int bam_mpileup(int argc, char *argv[]) if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break; /* else fall-through */ case '?': - print_usage(pysamerr, &mplp); + print_usage(pysam_stderr, &mplp); return 1; } } @@ -1004,13 +1006,13 @@ int bam_mpileup(int argc, char *argv[]) if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { - fprintf(pysamerr,"Error: The -B option cannot be combined with -E\n"); + fprintf(pysam_stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { - print_usage(pysamerr, &mplp); + print_usage(pysam_stderr, &mplp); return 1; } int ret; diff --git a/samtools/bam_quickcheck.c b/samtools/bam_quickcheck.c index 8d1e7ef..6c3c664 100644 --- a/samtools/bam_quickcheck.c +++ b/samtools/bam_quickcheck.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -36,6 +38,21 @@ static void usage_quickcheck(FILE *write_to) "Options:\n" " -v verbose output (repeat for more verbosity)\n" "\n" +"Notes:\n" +"\n" +"1. In order to use this command effectively, you should check its exit status;\n" +" without any -v options it will NOT print any output, even when some files\n" +" fail the check. 
One way to use quickcheck might be as a check that all\n" +" BAM files in a directory are okay:\n" +"\n" +"\tsamtools quickcheck *.bam && echo 'all ok' \\\n" +"\t || echo 'fail!'\n" +"\n" +" To also determine which files have failed, use the -v option:\n" +"\n" +"\tsamtools quickcheck -v *.bam > bad_bams.fofn \\\n" +"\t && echo 'all ok' \\\n" +"\t || echo 'some files failed check, see bad_bams.fofn'\n" ); } @@ -121,7 +138,10 @@ int main_quickcheck(int argc, char** argv) } } - hts_close(hts_fp); + if (hts_close(hts_fp) < 0) { + file_state |= 32; + if (verbose >= 2) fprintf(stderr, "%s did not close cleanly\n", fn); + } } if (file_state > 0 && verbose >= 1) { diff --git a/samtools/bam_quickcheck.c.pysam.c b/samtools/bam_quickcheck.c.pysam.c index b589d46..26dbeb9 100644 --- a/samtools/bam_quickcheck.c.pysam.c +++ b/samtools/bam_quickcheck.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -38,6 +40,21 @@ static void usage_quickcheck(FILE *write_to) "Options:\n" " -v verbose output (repeat for more verbosity)\n" "\n" +"Notes:\n" +"\n" +"1. In order to use this command effectively, you should check its exit status;\n" +" without any -v options it will NOT print any output, even when some files\n" +" fail the check. 
One way to use quickcheck might be as a check that all\n" +" BAM files in a directory are okay:\n" +"\n" +"\tsamtools quickcheck *.bam && echo 'all ok' \\\n" +"\t || echo 'fail!'\n" +"\n" +" To also determine which files have failed, use the -v option:\n" +"\n" +"\tsamtools quickcheck -v *.bam > bad_bams.fofn \\\n" +"\t && echo 'all ok' \\\n" +"\t || echo 'some files failed check, see bad_bams.fofn'\n" ); } @@ -54,7 +71,7 @@ int main_quickcheck(int argc, char** argv) verbose++; break; default: - usage_quickcheck(pysamerr); + usage_quickcheck(pysam_stderr); return 1; } } @@ -63,12 +80,12 @@ int main_quickcheck(int argc, char** argv) argv += optind; if (argc < 1) { - usage_quickcheck(stdout); + usage_quickcheck(pysam_stdout); return 1; } if (verbose >= 2) { - fprintf(pysamerr, "verbosity set to %d\n", verbose); + fprintf(pysam_stderr, "verbosity set to %d\n", verbose); } if (verbose >= 4) { @@ -82,52 +99,55 @@ int main_quickcheck(int argc, char** argv) char* fn = argv[i]; int file_state = 0; - if (verbose >= 3) fprintf(pysamerr, "checking %s\n", fn); + if (verbose >= 3) fprintf(pysam_stderr, "checking %s\n", fn); // attempt to open htsFile *hts_fp = hts_open(fn, "r"); if (hts_fp == NULL) { - if (verbose >= 2) fprintf(pysamerr, "%s could not be opened for reading\n", fn); + if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading\n", fn); file_state |= 2; } else { - if (verbose >= 3) fprintf(pysamerr, "opened %s\n", fn); + if (verbose >= 3) fprintf(pysam_stderr, "opened %s\n", fn); // make sure we have sequence data const htsFormat *fmt = hts_get_format(hts_fp); if (fmt->category != sequence_data ) { - if (verbose >= 2) fprintf(pysamerr, "%s was not identified as sequence data\n", fn); + if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data\n", fn); file_state |= 4; } else { - if (verbose >= 3) fprintf(pysamerr, "%s is sequence data\n", fn); + if (verbose >= 3) fprintf(pysam_stderr, "%s is sequence data\n", fn); // check 
header bam_hdr_t *header = sam_hdr_read(hts_fp); if (header->n_targets <= 0) { - if (verbose >= 2) fprintf(pysamerr, "%s had no targets in header\n", fn); + if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header\n", fn); file_state |= 8; } else { - if (verbose >= 3) fprintf(pysamerr, "%s has %d targets in header\n", fn, header->n_targets); + if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header\n", fn, header->n_targets); } // only check EOF on BAM for now // TODO implement and use hts_check_EOF() to include CRAM support if (fmt->format == bam) { if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) { - if (verbose >= 2) fprintf(pysamerr, "%s was missing EOF block\n", fn); + if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block\n", fn); file_state |= 16; } else { - if (verbose >= 3) fprintf(pysamerr, "%s has good EOF block\n", fn); + if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block\n", fn); } } } - hts_close(hts_fp); + if (hts_close(hts_fp) < 0) { + file_state |= 32; + if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly\n", fn); + } } if (file_state > 0 && verbose >= 1) { - fprintf(stdout, "%s\n", fn); + fprintf(pysam_stdout, "%s\n", fn); } ret |= file_state; } diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c index dc43807..0469c06 100644 --- a/samtools/bam_reheader.c +++ b/samtools/bam_reheader.c @@ -1,7 +1,7 @@ /* bam_reheader.c -- reheader subcommand. Copyright (C) 2010 Broad Institute. - Copyright (C) 2012, 2013 Genome Research Ltd. + Copyright (C) 2012-2015 Genome Research Ltd. Author: Heng Li @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -43,47 +45,77 @@ DEALINGS IN THE SOFTWARE. 
*/ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, const char *arg_list, int add_PG) { - BGZF *fp; + BGZF *fp = NULL; ssize_t len; - uint8_t *buf; + uint8_t *buf = NULL; + SAM_hdr *sh = NULL; if (in->is_write) return -1; buf = malloc(BUF_SIZE); + if (!buf) { + fprintf(stderr, "Out of memory\n"); + return -1; + } if (bam_hdr_read(in) == NULL) { fprintf(stderr, "Couldn't read header\n"); - free(buf); - return -1; + goto fail; } fp = bgzf_fdopen(fd, "w"); + if (!fp) { + print_error_errno("reheader", "Couldn't open output file"); + goto fail; + } if (add_PG) { // Around the houses, but it'll do until we can manipulate bam_hdr_t natively. - SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text); + sh = sam_hdr_parse_(h->text, h->l_text); + if (!sh) + goto fail; if (sam_hdr_add_PG(sh, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? arg_list : NULL, NULL) != 0) - return -1; + goto fail; free(h->text); h->text = strdup(sam_hdr_str(sh)); h->l_text = sam_hdr_length(sh); if (!h->text) - return -1; + goto fail; sam_hdr_free(sh); + sh = NULL; } - bam_hdr_write(fp, h); + if (bam_hdr_write(fp, h) < 0) { + print_error_errno("reheader", "Couldn't write header"); + goto fail; + } if (in->block_offset < in->block_length) { - bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); - bgzf_flush(fp); + if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_flush(fp) < 0) goto write_fail; + } + while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) { + if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail; + } + if (len < 0) { + fprintf(stderr, "[%s] Error reading input file\n", __func__); + goto fail; } - while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) - bgzf_raw_write(fp, buf, len); free(buf); fp->block_offset = in->block_offset = 0; - bgzf_close(fp); + if (bgzf_close(fp) < 0) { + fprintf(stderr, "[%s] Error closing output file\n", __func__); + 
return -1; + } return 0; + + write_fail: + print_error_errno("reheader", "Error writing to output file"); + fail: + bgzf_close(fp); + free(buf); + sam_hdr_free(sh); + return -1; } /* @@ -445,7 +477,7 @@ int main_reheader(int argc, char *argv[]) { // read the header samFile *fph = sam_open(argv[optind], "r"); if (fph == 0) { - fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[optind]); + print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]); return 1; } h = sam_hdr_read(fph); @@ -458,7 +490,7 @@ int main_reheader(int argc, char *argv[]) } in = sam_open(argv[optind+1], inplace?"r+":"r"); if (in == 0) { - fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[optind+1]); + print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); return 1; } if (hts_get_format(in)->format == bam) { diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index 0519137..16990e6 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -3,7 +3,7 @@ /* bam_reheader.c -- reheader subcommand. Copyright (C) 2010 Broad Institute. - Copyright (C) 2012, 2013 Genome Research Ltd. + Copyright (C) 2012-2015 Genome Research Ltd. Author: Heng Li @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -45,51 +47,81 @@ DEALINGS IN THE SOFTWARE. 
*/ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, const char *arg_list, int add_PG) { - BGZF *fp; + BGZF *fp = NULL; ssize_t len; - uint8_t *buf; + uint8_t *buf = NULL; + SAM_hdr *sh = NULL; if (in->is_write) return -1; buf = malloc(BUF_SIZE); - if (bam_hdr_read(in) == NULL) { - fprintf(pysamerr, "Couldn't read header\n"); - free(buf); + if (!buf) { + fprintf(pysam_stderr, "Out of memory\n"); return -1; } + if (bam_hdr_read(in) == NULL) { + fprintf(pysam_stderr, "Couldn't read header\n"); + goto fail; + } fp = bgzf_fdopen(fd, "w"); + if (!fp) { + print_error_errno("reheader", "Couldn't open output file"); + goto fail; + } if (add_PG) { // Around the houses, but it'll do until we can manipulate bam_hdr_t natively. - SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text); + sh = sam_hdr_parse_(h->text, h->l_text); + if (!sh) + goto fail; if (sam_hdr_add_PG(sh, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? arg_list : NULL, NULL) != 0) - return -1; + goto fail; free(h->text); h->text = strdup(sam_hdr_str(sh)); h->l_text = sam_hdr_length(sh); if (!h->text) - return -1; + goto fail; sam_hdr_free(sh); + sh = NULL; } - bam_hdr_write(fp, h); + if (bam_hdr_write(fp, h) < 0) { + print_error_errno("reheader", "Couldn't write header"); + goto fail; + } if (in->block_offset < in->block_length) { - bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); - bgzf_flush(fp); + if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; + if (bgzf_flush(fp) < 0) goto write_fail; + } + while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) { + if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail; + } + if (len < 0) { + fprintf(pysam_stderr, "[%s] Error reading input file\n", __func__); + goto fail; } - while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) - bgzf_raw_write(fp, buf, len); free(buf); fp->block_offset = in->block_offset = 0; - bgzf_close(fp); + if 
(bgzf_close(fp) < 0) { + fprintf(pysam_stderr, "[%s] Error closing output file\n", __func__); + return -1; + } return 0; + + write_fail: + print_error_errno("reheader", "Error writing to output file"); + fail: + bgzf_close(fp); + free(buf); + sam_hdr_free(sh); + return -1; } /* - * Reads a file and outputs a new CRAM file to stdout with 'h' + * Reads a file and outputs a new CRAM file to pysam_stdout with 'h' * replaced as the header. No checks are made to the validity. * * FIXME: error checking @@ -173,7 +205,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list if (cram_major_vers(fd) < 2 || cram_major_vers(fd) > 3) { - fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__, + fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); goto err; } @@ -206,7 +238,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list goto err; if (cram_block_get_uncomp_size(b) < header_len+4) { - fprintf(pysamerr, "New header will not fit. Use non-inplace version (%d > %d)\n", + fprintf(pysam_stderr, "New header will not fit. Use non-inplace version (%d > %d)\n", header_len+4, cram_block_get_uncomp_size(b)); ret = -2; goto err; @@ -269,7 +301,7 @@ int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list if (cram_major_vers(fd) < 2 || cram_major_vers(fd) > 3) { - fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__, + fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); goto err; } @@ -341,7 +373,7 @@ int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list goto err; if (old_container_sz != container_sz) { - fprintf(pysamerr, "Quirk of fate makes this troublesome! " + fprintf(pysam_stderr, "Quirk of fate makes this troublesome! 
" "Please use non-inplace version.\n"); goto err; } @@ -360,7 +392,7 @@ int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list goto err; if (cram_block_size(b) > cram_container_get_length(c)) { - fprintf(pysamerr, "New header will not fit. Use non-inplace version" + fprintf(pysam_stderr, "New header will not fit. Use non-inplace version" " (%d > %d)\n", (int)cram_block_size(b), cram_container_get_length(c)); ret = -2; @@ -398,7 +430,7 @@ int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG); case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG); default: - fprintf(pysamerr, "[%s] unsupported CRAM version %d\n", __func__, + fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); return -1; } @@ -412,7 +444,7 @@ static void usage(FILE *fp, int ret) { "Options:\n" " -P, --no-PG Do not generate an @PG header line.\n" " -i, --in-place Modify the bam/cram file directly.\n" - " (Defaults to outputting to stdout.)\n"); + " (Defaults to outputting to pysam_stdout.)\n"); exit(ret); } @@ -431,41 +463,40 @@ int main_reheader(int argc, char *argv[]) }; while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) { - fprintf(stderr, " %i %c %s\n", optind, c, argv[optind-1]); switch (c) { case 'P': add_PG = 0; break; case 'i': inplace = 1; break; - case 'h': usage(stdout, 0); break; + case 'h': usage(pysam_stdout, 0); break; default: - fprintf(pysamerr, "Invalid option '%c'\n", c); - usage(pysamerr, 1); + fprintf(pysam_stderr, "Invalid option '%c'\n", c); + usage(pysam_stderr, 1); } } if (argc - optind != 2) - usage(pysamerr, 1); + usage(pysam_stderr, 1); { // read the header samFile *fph = sam_open(argv[optind], "r"); if (fph == 0) { - fprintf(pysamerr, "[%s] fail to read the header from %s.\n", __func__, argv[optind]); + print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]); return 1; } h = 
sam_hdr_read(fph); sam_close(fph); if (h == NULL) { - fprintf(pysamerr, "[%s] failed to read the header for '%s'.\n", + fprintf(pysam_stderr, "[%s] failed to read the header for '%s'.\n", __func__, argv[1]); return 1; } } in = sam_open(argv[optind+1], inplace?"r+":"r"); if (in == 0) { - fprintf(pysamerr, "[%s] fail to open file %s.\n", __func__, argv[optind+1]); + print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); return 1; } if (hts_get_format(in)->format == bam) { - r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, add_PG); + r = bam_reheader(in->fp.bgzf, h, fileno(pysam_stdout), arg_list, add_PG); } else { if (inplace) r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG); diff --git a/samtools/bam_rmdup.c b/samtools/bam_rmdup.c index cdca878..57612b4 100644 --- a/samtools/bam_rmdup.c +++ b/samtools/bam_rmdup.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -30,6 +32,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "htslib/sam.h" #include "sam_opts.h" +#include "samtools.h" #include "bam.h" // for bam_get_library typedef bam1_t *bam1_p; @@ -60,14 +63,24 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) stack->a[stack->n++] = b; } -static inline void dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) +static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) { int i; for (i = 0; i != stack->n; ++i) { - sam_write1(out, hdr, stack->a[i]); + if (sam_write1(out, hdr, stack->a[i]) < 0) return -1; bam_destroy1(stack->a[i]); + stack->a[i] = NULL; } stack->n = 0; + return 0; +} + +static inline void clear_stack(tmp_stack_t *stack) { + int i; + if (!stack->a) return; + for (i = 0; i != stack->n; ++i) { + bam_destroy1(stack->a[i]); + } } static void clear_del_set(khash_t(name) *del_set) @@ -114,25 +127,29 @@ static inline int sum_qual(const bam1_t *b) return q; } -void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) +int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) { - bam1_t *b; - int last_tid = -1, last_pos = -1; + bam1_t *b = NULL; + int last_tid = -1, last_pos = -1, r; tmp_stack_t stack; khint_t k; - khash_t(lib) *aux; - khash_t(name) *del_set; + khash_t(lib) *aux = NULL; + khash_t(name) *del_set = NULL; + memset(&stack, 0, sizeof(tmp_stack_t)); aux = kh_init(lib); del_set = kh_init(name); b = bam_init1(); - memset(&stack, 0, sizeof(tmp_stack_t)); + if (!aux || !del_set || !b) { + perror(__func__); + goto fail; + } kh_resize(name, del_set, 4 * BUFFER_SIZE); - while (sam_read1(in, hdr, b) >= 0) { + while ((r = sam_read1(in, hdr, b)) >= 0) { bam1_core_t *c = &b->core; if (c->tid != last_tid || last_pos != c->pos) { - dump_best(&stack, out, hdr); // write the result + if (dump_best(&stack, out, hdr) < 0) goto write_fail; // write the result clear_best(aux, BUFFER_SIZE); if (c->tid != last_tid) { clear_best(aux, 0); @@ -141,8 +158,10 @@ void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) 
clear_del_set(del_set); } if ((int)c->tid == -1) { // append unmapped reads - sam_write1(out, hdr, b); - while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b); + if (sam_write1(out, hdr, b) < 0) goto write_fail; + while ((r = sam_read1(in, hdr, b)) >= 0) { + if (sam_write1(out, hdr, b) < 0) goto write_fail; + } break; } last_tid = c->tid; @@ -150,7 +169,7 @@ void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) } } if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { - sam_write1(out, hdr, b); + if (sam_write1(out, hdr, b) < 0) goto write_fail; } else if (c->isize > 0) { // paired, head uint64_t key = (uint64_t)c->pos<<32 | c->isize; const char *lib; @@ -178,19 +197,26 @@ void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) if (k != kh_end(del_set)) { free((char*)kh_key(del_set, k)); kh_del(name, del_set, k); - } else sam_write1(out, hdr, b); + } else { + if (sam_write1(out, hdr, b) < 0) goto write_fail; + } } last_pos = c->pos; } + if (r < -1) { + fprintf(stderr, "[%s] failed to read input file\n", __func__); + goto fail; + } for (k = kh_begin(aux); k != kh_end(aux); ++k) { if (kh_exist(aux, k)) { lib_aux_t *q = &kh_val(aux, k); - dump_best(&stack, out, hdr); + if (dump_best(&stack, out, hdr) < 0) goto write_fail; fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); kh_destroy(pos, q->best_hash); free((char*)kh_key(aux, k)); + kh_del(lib, aux, k); } } kh_destroy(lib, aux); @@ -199,9 +225,32 @@ void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) kh_destroy(name, del_set); free(stack.a); bam_destroy1(b); + return 0; + + write_fail: + print_error_errno("rmdup", "failed to write record"); + fail: + clear_stack(&stack); + free(stack.a); + if (aux) { + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + 
kh_destroy(pos, q->best_hash); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + } + if (del_set) { + clear_del_set(del_set); + kh_destroy(name, del_set); + } + bam_destroy1(b); + return 1; } -void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); +int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); static int rmdup_usage(void) { fprintf(stderr, "\n"); @@ -215,7 +264,7 @@ static int rmdup_usage(void) { int bam_rmdup(int argc, char *argv[]) { - int c, is_se = 0, force_se = 0; + int c, ret, is_se = 0, force_se = 0; samFile *in, *out; bam_hdr_t *header; char wmode[3] = {'w', 'b', 0}; @@ -239,6 +288,10 @@ int bam_rmdup(int argc, char *argv[]) return rmdup_usage(); in = sam_open_format(argv[optind], "r", &ga.in); + if (!in) { + print_error_errno("rmdup", "failed to open \"%s\" for input", argv[optind]); + return 1; + } header = sam_hdr_read(in); if (header == NULL || header->n_targets == 0) { fprintf(stderr, "[bam_rmdup] input SAM does not have header. 
Abort!\n"); @@ -247,15 +300,23 @@ int bam_rmdup(int argc, char *argv[]) sam_open_mode(wmode+1, argv[optind+1], NULL); out = sam_open_format(argv[optind+1], wmode, &ga.out); - if (in == 0 || out == 0) { - fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); + if (!out) { + print_error_errno("rmdup", "failed to open \"%s\" for output", argv[optind+1]); + return 1; + } + if (sam_hdr_write(out, header) < 0) { + print_error_errno("rmdup", "failed to write header"); return 1; } - sam_hdr_write(out, header); - if (is_se) bam_rmdupse_core(in, header, out, force_se); - else bam_rmdup_core(in, header, out); + if (is_se) ret = bam_rmdupse_core(in, header, out, force_se); + else ret = bam_rmdup_core(in, header, out); + bam_hdr_destroy(header); - sam_close(in); sam_close(out); - return 0; + sam_close(in); + if (sam_close(out) < 0) { + fprintf(stderr, "[bam_rmdup] error closing output file\n"); + ret = 1; + } + return ret; } diff --git a/samtools/bam_rmdup.c.pysam.c b/samtools/bam_rmdup.c.pysam.c index 4ece6f2..3c16025 100644 --- a/samtools/bam_rmdup.c.pysam.c +++ b/samtools/bam_rmdup.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -32,6 +34,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "htslib/sam.h" #include "sam_opts.h" +#include "samtools.h" #include "bam.h" // for bam_get_library typedef bam1_t *bam1_p; @@ -62,14 +65,24 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) stack->a[stack->n++] = b; } -static inline void dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) +static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) { int i; for (i = 0; i != stack->n; ++i) { - sam_write1(out, hdr, stack->a[i]); + if (sam_write1(out, hdr, stack->a[i]) < 0) return -1; bam_destroy1(stack->a[i]); + stack->a[i] = NULL; } stack->n = 0; + return 0; +} + +static inline void clear_stack(tmp_stack_t *stack) { + int i; + if (!stack->a) return; + for (i = 0; i != stack->n; ++i) { + bam_destroy1(stack->a[i]); + } } static void clear_del_set(khash_t(name) *del_set) @@ -116,43 +129,49 @@ static inline int sum_qual(const bam1_t *b) return q; } -void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) +int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) { - bam1_t *b; - int last_tid = -1, last_pos = -1; + bam1_t *b = NULL; + int last_tid = -1, last_pos = -1, r; tmp_stack_t stack; khint_t k; - khash_t(lib) *aux; - khash_t(name) *del_set; + khash_t(lib) *aux = NULL; + khash_t(name) *del_set = NULL; + memset(&stack, 0, sizeof(tmp_stack_t)); aux = kh_init(lib); del_set = kh_init(name); b = bam_init1(); - memset(&stack, 0, sizeof(tmp_stack_t)); + if (!aux || !del_set || !b) { + perror(__func__); + goto fail; + } kh_resize(name, del_set, 4 * BUFFER_SIZE); - while (sam_read1(in, hdr, b) >= 0) { + while ((r = sam_read1(in, hdr, b)) >= 0) { bam1_core_t *c = &b->core; if (c->tid != last_tid || last_pos != c->pos) { - dump_best(&stack, out, hdr); // write the result + if (dump_best(&stack, out, hdr) < 0) goto write_fail; // write the result clear_best(aux, BUFFER_SIZE); if (c->tid != last_tid) { clear_best(aux, 0); if (kh_size(del_set)) { // check - fprintf(pysamerr, "[bam_rmdup_core] %llu 
unmatched pairs\n", (long long)kh_size(del_set)); + fprintf(pysam_stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); clear_del_set(del_set); } if ((int)c->tid == -1) { // append unmapped reads - sam_write1(out, hdr, b); - while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b); + if (sam_write1(out, hdr, b) < 0) goto write_fail; + while ((r = sam_read1(in, hdr, b)) >= 0) { + if (sam_write1(out, hdr, b) < 0) goto write_fail; + } break; } last_tid = c->tid; - fprintf(pysamerr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); + fprintf(pysam_stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); } } if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { - sam_write1(out, hdr, b); + if (sam_write1(out, hdr, b) < 0) goto write_fail; } else if (c->isize > 0) { // paired, head uint64_t key = (uint64_t)c->pos<<32 | c->isize; const char *lib; @@ -170,7 +189,7 @@ void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) bam_copy1(p, b); // replaced as b } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed if (ret == 0) - fprintf(pysamerr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b)); + fprintf(pysam_stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam_get_qname(b)); } else { // not found in best_hash kh_val(q->best_hash, k) = bam_dup1(b); stack_insert(&stack, kh_val(q->best_hash, k)); @@ -180,19 +199,26 @@ void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) if (k != kh_end(del_set)) { free((char*)kh_key(del_set, k)); kh_del(name, del_set, k); - } else sam_write1(out, hdr, b); + } else { + if (sam_write1(out, hdr, b) < 0) goto write_fail; + } } last_pos = c->pos; } + if (r < -1) { + fprintf(pysam_stderr, "[%s] failed to read input file\n", __func__); + goto fail; + } for (k = kh_begin(aux); k != kh_end(aux); ++k) { if (kh_exist(aux, k)) { lib_aux_t *q = &kh_val(aux, k); - dump_best(&stack, out, hdr); - fprintf(pysamerr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + if (dump_best(&stack, out, hdr) < 0) goto write_fail; + fprintf(pysam_stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); kh_destroy(pos, q->best_hash); free((char*)kh_key(aux, k)); + kh_del(lib, aux, k); } } kh_destroy(lib, aux); @@ -201,23 +227,46 @@ void bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) kh_destroy(name, del_set); free(stack.a); bam_destroy1(b); + return 0; + + write_fail: + print_error_errno("rmdup", "failed to write record"); + fail: + clear_stack(&stack); + free(stack.a); + if (aux) { + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + kh_destroy(pos, q->best_hash); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + } + if (del_set) { + clear_del_set(del_set); + kh_destroy(name, del_set); + } + bam_destroy1(b); + return 1; } -void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); +int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); static int rmdup_usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, 
"Usage: samtools rmdup [-sS] \n\n"); - fprintf(pysamerr, "Option: -s rmdup for SE reads\n"); - fprintf(pysamerr, " -S treat PE reads as SE in rmdup (force -s)\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Usage: samtools rmdup [-sS] \n\n"); + fprintf(pysam_stderr, "Option: -s rmdup for SE reads\n"); + fprintf(pysam_stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); - sam_global_opt_help(pysamerr, "-...."); + sam_global_opt_help(pysam_stderr, "-...."); return 1; } int bam_rmdup(int argc, char *argv[]) { - int c, is_se = 0, force_se = 0; + int c, ret, is_se = 0, force_se = 0; samFile *in, *out; bam_hdr_t *header; char wmode[3] = {'w', 'b', 0}; @@ -241,23 +290,35 @@ int bam_rmdup(int argc, char *argv[]) return rmdup_usage(); in = sam_open_format(argv[optind], "r", &ga.in); + if (!in) { + print_error_errno("rmdup", "failed to open \"%s\" for input", argv[optind]); + return 1; + } header = sam_hdr_read(in); if (header == NULL || header->n_targets == 0) { - fprintf(pysamerr, "[bam_rmdup] input SAM does not have header. Abort!\n"); + fprintf(pysam_stderr, "[bam_rmdup] input SAM does not have header. 
Abort!\n"); return 1; } sam_open_mode(wmode+1, argv[optind+1], NULL); out = sam_open_format(argv[optind+1], wmode, &ga.out); - if (in == 0 || out == 0) { - fprintf(pysamerr, "[bam_rmdup] fail to read/write input files\n"); + if (!out) { + print_error_errno("rmdup", "failed to open \"%s\" for output", argv[optind+1]); + return 1; + } + if (sam_hdr_write(out, header) < 0) { + print_error_errno("rmdup", "failed to write header"); return 1; } - sam_hdr_write(out, header); - if (is_se) bam_rmdupse_core(in, header, out, force_se); - else bam_rmdup_core(in, header, out); + if (is_se) ret = bam_rmdupse_core(in, header, out, force_se); + else ret = bam_rmdup_core(in, header, out); + bam_hdr_destroy(header); - sam_close(in); sam_close(out); - return 0; + sam_close(in); + if (sam_close(out) < 0) { + fprintf(pysam_stderr, "[bam_rmdup] error closing output file\n"); + ret = 1; + } + return ret; } diff --git a/samtools/bam_rmdupse.c b/samtools/bam_rmdupse.c index d17f6f5..f6baef0 100644 --- a/samtools/bam_rmdupse.c +++ b/samtools/bam_rmdupse.c @@ -23,12 +23,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include "bam.h" // for bam_get_library #include "htslib/sam.h" #include "htslib/khash.h" #include "htslib/klist.h" +#include "samtools.h" #define QUEUE_CLEAR_SIZE 0x100000 #define MAX_POS 0x7fffffff @@ -93,8 +96,8 @@ static void clear_besthash(besthash_t *h, int32_t pos) kh_del(best, h, k); } -static void dump_alignment(samFile *out, bam_hdr_t *hdr, - queue_t *queue, int32_t pos, khash_t(lib) *h) +static int dump_alignment(samFile *out, bam_hdr_t *hdr, + queue_t *queue, int32_t pos, khash_t(lib) *h) { if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { khint_t k; @@ -108,7 +111,7 @@ static void dump_alignment(samFile *out, bam_hdr_t *hdr, continue; } if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; - sam_write1(out, hdr, q->b); + if (sam_write1(out, hdr, q->b) < 0) return -1; q->b->l_data = 0; kl_shift(q, queue, 0); } @@ -119,28 +122,40 @@ static void dump_alignment(samFile *out, bam_hdr_t *hdr, } } } + return 0; } -void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) +int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) { - bam1_t *b; - queue_t *queue; + bam1_t *b = NULL; + queue_t *queue = NULL; khint_t k; - int last_tid = -2; - khash_t(lib) *aux; + int last_tid = -2, r; + khash_t(lib) *aux = NULL; aux = kh_init(lib); b = bam_init1(); queue = kl_init(q); - while (sam_read1(in, hdr, b) >= 0) { + if (!aux || !b || !queue) { + perror(__func__); + goto fail; + } + + while ((r = sam_read1(in, hdr, b)) >= 0) { bam1_core_t *c = &b->core; int endpos = bam_endpos(b); int score = sum_qual(b); if (last_tid != c->tid) { - if (last_tid >= 0) dump_alignment(out, hdr, queue, MAX_POS, aux); + if (last_tid >= 0) { + if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0) + goto write_fail; + } last_tid = c->tid; - } else dump_alignment(out, hdr, queue, c->pos, aux); + } else { + if (dump_alignment(out, hdr, queue, c->pos, aux) < 0) + goto write_fail; + } if ((c->flag&BAM_FUNMAP) 
|| ((c->flag&BAM_FPAIRED) && !force_se)) { push_queue(queue, b, endpos, score); } else { @@ -170,7 +185,12 @@ void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) } else kh_val(h, k) = push_queue(queue, b, endpos, score); } } - dump_alignment(out, hdr, queue, MAX_POS, aux); + if (r < -1) { + fprintf(stderr, "[%s] error reading input file\n", __func__); + goto fail; + } + + if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0) goto write_fail; for (k = kh_begin(aux); k != kh_end(aux); ++k) { if (kh_exist(aux, k)) { @@ -179,9 +199,29 @@ void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); kh_destroy(best, q->left); kh_destroy(best, q->rght); free((char*)kh_key(aux, k)); + kh_del(lib, aux, k); } } kh_destroy(lib, aux); bam_destroy1(b); kl_destroy(q, queue); + return 0; + + write_fail: + print_error_errno("rmdup", "failed to write record"); + fail: + if (aux) { + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + kh_destroy(best, q->left); + kh_destroy(best, q->rght); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + } + bam_destroy1(b); + kl_destroy(q, queue); + return 1; } diff --git a/samtools/bam_rmdupse.c.pysam.c b/samtools/bam_rmdupse.c.pysam.c index 06895a8..3a3d0d0 100644 --- a/samtools/bam_rmdupse.c.pysam.c +++ b/samtools/bam_rmdupse.c.pysam.c @@ -25,12 +25,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include "bam.h" // for bam_get_library #include "htslib/sam.h" #include "htslib/khash.h" #include "htslib/klist.h" +#include "samtools.h" #define QUEUE_CLEAR_SIZE 0x100000 #define MAX_POS 0x7fffffff @@ -95,8 +98,8 @@ static void clear_besthash(besthash_t *h, int32_t pos) kh_del(best, h, k); } -static void dump_alignment(samFile *out, bam_hdr_t *hdr, - queue_t *queue, int32_t pos, khash_t(lib) *h) +static int dump_alignment(samFile *out, bam_hdr_t *hdr, + queue_t *queue, int32_t pos, khash_t(lib) *h) { if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { khint_t k; @@ -110,7 +113,7 @@ static void dump_alignment(samFile *out, bam_hdr_t *hdr, continue; } if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; - sam_write1(out, hdr, q->b); + if (sam_write1(out, hdr, q->b) < 0) return -1; q->b->l_data = 0; kl_shift(q, queue, 0); } @@ -121,28 +124,40 @@ static void dump_alignment(samFile *out, bam_hdr_t *hdr, } } } + return 0; } -void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) +int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) { - bam1_t *b; - queue_t *queue; + bam1_t *b = NULL; + queue_t *queue = NULL; khint_t k; - int last_tid = -2; - khash_t(lib) *aux; + int last_tid = -2, r; + khash_t(lib) *aux = NULL; aux = kh_init(lib); b = bam_init1(); queue = kl_init(q); - while (sam_read1(in, hdr, b) >= 0) { + if (!aux || !b || !queue) { + perror(__func__); + goto fail; + } + + while ((r = sam_read1(in, hdr, b)) >= 0) { bam1_core_t *c = &b->core; int endpos = bam_endpos(b); int score = sum_qual(b); if (last_tid != c->tid) { - if (last_tid >= 0) dump_alignment(out, hdr, queue, MAX_POS, aux); + if (last_tid >= 0) { + if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0) + goto write_fail; + } last_tid = c->tid; - } else dump_alignment(out, hdr, queue, c->pos, aux); + } else { + if (dump_alignment(out, hdr, queue, c->pos, aux) < 0) + goto write_fail; + } if ((c->flag&BAM_FUNMAP) 
|| ((c->flag&BAM_FPAIRED) && !force_se)) { push_queue(queue, b, endpos, score); } else { @@ -172,18 +187,43 @@ void bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) } else kh_val(h, k) = push_queue(queue, b, endpos, score); } } - dump_alignment(out, hdr, queue, MAX_POS, aux); + if (r < -1) { + fprintf(pysam_stderr, "[%s] error reading input file\n", __func__); + goto fail; + } + + if (dump_alignment(out, hdr, queue, MAX_POS, aux) < 0) goto write_fail; for (k = kh_begin(aux); k != kh_end(aux); ++k) { if (kh_exist(aux, k)) { lib_aux_t *q = &kh_val(aux, k); - fprintf(pysamerr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + fprintf(pysam_stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); kh_destroy(best, q->left); kh_destroy(best, q->rght); free((char*)kh_key(aux, k)); + kh_del(lib, aux, k); } } kh_destroy(lib, aux); bam_destroy1(b); kl_destroy(q, queue); + return 0; + + write_fail: + print_error_errno("rmdup", "failed to write record"); + fail: + if (aux) { + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + kh_destroy(best, q->left); + kh_destroy(best, q->rght); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + } + bam_destroy1(b); + kl_destroy(q, queue); + return 1; } diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 7a441ae..4955dcc 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -1,6 +1,6 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2015 Genome Research Ltd. + Copyright (C) 2008-2016 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -31,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -404,7 +407,7 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, hdr_match_t *new_sq_matches = NULL; char *text; hdr_match_t matches[2]; - int32_t i, missing; + int32_t i; int32_t old_n_targets = merged_hdr->n_targets; khiter_t iter; int min_tid = -1; @@ -502,20 +505,20 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, text += matches[0].rm_eo; } - // Check if any new targets have been missed - missing = 0; + // Copy the @SQ headers found and recreate any missing from binary header. for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { if (new_sq_matches[i].rm_so >= 0) { if (match_to_ks(translate->text, &new_sq_matches[i], out_text)) goto memfail; if (kputc('\n', out_text) == EOF) goto memfail; } else { - fprintf(stderr, "[E::%s] @SQ SN (%s) found in binary header but not text header.\n", - __func__, merged_hdr->target_name[i + old_n_targets]); - missing++; + if (kputs("@SQ\tSN:", out_text) == EOF || + kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF || + kputs("\tLN:", out_text) == EOF || + kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF || + kputc('\n', out_text) == EOF) goto memfail; } } - if (missing) goto fail; free(new_sq_matches); return 0; @@ -775,7 +778,7 @@ static int finish_rg_pg(bool is_rg, klist_t(hdrln) *hdr_lines, static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg, - char* rg_override) + bool copy_co, char* rg_override) { klist_t(hdrln) *rg_list = NULL; klist_t(hdrln) *pg_list = NULL; @@ -817,20 +820,22 @@ static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, kl_destroy(hdrln, rg_list); rg_list = NULL; kl_destroy(hdrln, pg_list); pg_list = NULL; - // Just append @CO headers without translation - 
const char *line, *end_pointer; - for (line = translate->text; *line; line = end_pointer + 1) { - end_pointer = strchr(line, '\n'); - if (strncmp(line, "@CO", 3) == 0) { - if (end_pointer) { - if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) - goto memfail; - } else { // Last line with no trailing '\n' - if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; - if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; + if (copy_co) { + // Just append @CO headers without translation + const char *line, *end_pointer; + for (line = translate->text; *line; line = end_pointer + 1) { + end_pointer = strchr(line, '\n'); + if (strncmp(line, "@CO", 3) == 0) { + if (end_pointer) { + if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) + goto memfail; + } else { // Last line with no trailing '\n' + if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; + if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; + } } + if (end_pointer == NULL) break; } - if (end_pointer == NULL) break; } return 0; @@ -1036,6 +1041,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) // Create reverse translation table for tids int* rtrans = (int*)malloc(sizeof(int32_t)*n*n_targets); const int32_t NOTID = INT32_MIN; + if (!rtrans) return NULL; memset_pattern4((void*)rtrans, &NOTID, sizeof(int32_t)*n*n_targets); int i; for (i = 0; i < n; ++i) { @@ -1056,6 +1062,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) #define MERGE_FORCE 8 // Overwrite output BAM if it exists #define MERGE_COMBINE_RG 16 // Combine RG tags frather than redefining them #define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them +#define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only) /* * How merging is handled @@ -1101,8 +1108,8 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *reg, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt) { - 
samFile *fpout, **fp; - heap1_t *heap; + samFile *fpout, **fp = NULL; + heap1_t *heap = NULL; bam_hdr_t *hout = NULL; bam_hdr_t *hin = NULL; int i, j, *RG_len = NULL; @@ -1111,6 +1118,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; + int *rtrans = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; @@ -1127,20 +1135,36 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if (hin == NULL) { fprintf(stderr, "[bam_merge_core] couldn't read headers for '%s'\n", headers); - return -1; + goto mem_fail; + } + } else { + hout = bam_hdr_init(); + if (!hout) { + fprintf(stderr, "[bam_merge_core] couldn't allocate bam header\n"); + goto mem_fail; } + hout->text = strdup(""); + if (!hout->text) goto mem_fail; } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); + if (!fp) goto mem_fail; heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + if (!heap) goto mem_fail; iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); + if (!iter) goto mem_fail; hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); + if (!hdr) goto mem_fail; translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); + if (!translation_tbl) goto mem_fail; RG = (char**)calloc(n, sizeof(char*)); + if (!RG) goto mem_fail; + // prepare RG tag from file names if (flag & MERGE_RG) { RG_len = (int*)calloc(n, sizeof(int)); + if (!RG_len) goto mem_fail; for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; @@ -1149,6 +1173,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = (char*)calloc(l + 1, 1); + if (!RG[i]) goto mem_fail; RG_len[i] = l; strncpy(RG[i], s + j, l); } @@ -1159,7 +1184,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, trans_tbl_t dummy; int res; res = trans_tbl_init(merged_hdr, hin, &dummy, 
flag & MERGE_COMBINE_RG, - flag & MERGE_COMBINE_PG, NULL); + flag & MERGE_COMBINE_PG, true, NULL); trans_tbl_destroy(&dummy); if (res) return -1; // FIXME: memory leak } @@ -1169,31 +1194,19 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_hdr_t *hin; fp[i] = sam_open_format(fn[i], "r", in_fmt); if (fp[i] == NULL) { - int j; fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); - for (j = 0; j < i; ++j) { - bam_hdr_destroy(hdr[i]); - sam_close(fp[j]); - } - free(fp); free(heap); - // FIXME: possible memory leak - return -1; + goto fail; } hin = sam_hdr_read(fp[i]); if (hin == NULL) { fprintf(stderr, "[bam_merge_core] failed to read header for '%s'\n", fn[i]); - for (j = 0; j < i; ++j) { - bam_hdr_destroy(hdr[i]); - sam_close(fp[j]); - } - free(fp); free(heap); - // FIXME: possible memory leak - return -1; + goto fail; } if (trans_tbl_init(merged_hdr, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG, + (flag & MERGE_FIRST_CO)? 
(i == 0) : true, RG[i])) return -1; // FIXME: memory leak @@ -1224,12 +1237,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, // If we're only merging a specified region move our iters to start at that point if (reg) { - int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); - int tid, beg, end; - const char *name_lim = hts_parse_reg(reg, &beg, &end); + const char *name_lim; + + rtrans = rtrans_build(n, hout->n_targets, translation_tbl); + if (!rtrans) goto mem_fail; + + name_lim = hts_parse_reg(reg, &beg, &end); if (name_lim) { char *name = malloc(name_lim - reg + 1); + if (!name) goto mem_fail; memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); @@ -1244,7 +1261,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if (tid < 0) { if (name_lim) fprintf(stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); else fprintf(stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); - return -1; + goto fail; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); @@ -1253,7 +1270,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if (idx == NULL) { fprintf(stderr, "[%s] failed to load index for %s. 
Random alignment retrieval only works for indexed BAM or CRAM files.\n", __func__, fn[i]); - return -1; + goto fail; } if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); @@ -1261,47 +1278,70 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); - if (iter[i] == NULL) break; + if (iter[i] == NULL) { + if (mapped_tid != INT32_MIN) { + fprintf(stderr, + "[%s] failed to get iterator over " + "{%s, %d, %d, %d}\n", + __func__, fn[i], mapped_tid, beg, end); + } else { + fprintf(stderr, + "[%s] failed to get iterator over " + "{%s, HTS_IDX_NONE, 0, 0}\n", + __func__, fn[i]); + } + goto fail; + } } free(rtrans); + rtrans = NULL; } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); - if (iter[i] == NULL) break; + if (iter[i] == NULL) { + fprintf(stderr, "[%s] failed to get iterator\n", __func__); + goto fail; + } } else iter[i] = NULL; } } - if (i < n) { - fprintf(stderr, "[%s] Memory allocation failed\n", __func__); - return -1; - } - // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; + int res; h->i = i; h->b = bam_init1(); - if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { + if (!h->b) goto mem_fail; + res = iter[i] ? 
sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b); + if (res >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } - else { + else if (res == -1 && (!iter[i] || iter[i]->finished)) { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; + } else { + fprintf(stderr, "[%s] failed to read first record from %s\n", + __func__, fn[i]); + goto fail; } } // Open output file and write header if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) { - fprintf(stderr, "[%s] fail to create the output file.\n", __func__); + fprintf(stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno)); + return -1; + } + if (sam_hdr_write(fpout, hout) != 0) { + fprintf(stderr, "[%s] failed to write header.\n", __func__); + sam_close(fpout); return -1; } - sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge @@ -1313,16 +1353,24 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } - sam_write1(fpout, hout, b); + if (sam_write1(fpout, hout, b) < 0) { + fprintf(stderr, "[%s] failed to write to output file.\n", __func__); + sam_close(fpout); + return -1; + } if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; - } else if (j == -1) { + } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; - } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. 
Continue anyway.\n", fn[heap->i]); + } else { + fprintf(stderr, "[bam_merge_core] error: '%s' is truncated.\n", + fn[heap->i]); + goto fail; + } ks_heapadjust(heap, 0, n, heap); } @@ -1340,9 +1388,39 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_hdr_destroy(hin); bam_hdr_destroy(hout); free_merged_header(merged_hdr); - sam_close(fpout); free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); + if (sam_close(fpout) < 0) { + fprintf(stderr, "[bam_merge_core] error closing output file\n"); + return -1; + } return 0; + + mem_fail: + fprintf(stderr, "[bam_merge_core] Out of memory\n"); + + fail: + if (flag & MERGE_RG) { + if (RG) { + for (i = 0; i != n; ++i) free(RG[i]); + } + free(RG_len); + } + for (i = 0; i < n; ++i) { + if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i); + if (iter && iter[i]) hts_itr_destroy(iter[i]); + if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); + if (fp && fp[i]) sam_close(fp[i]); + if (heap && heap[i].b) bam_destroy1(heap[i].b); + } + if (hout) bam_hdr_destroy(hout); + free(RG); + free(translation_tbl); + free(hdr); + free(iter); + free(heap); + free(fp); + free(rtrans); + return -1; } // Unused here but may be used by legacy samtools-using third-party code @@ -1361,7 +1439,7 @@ static void merge_usage(FILE *to) "Usage: samtools merge [-nurlf] [-h inh.sam] [-b ] [ ... 
]\n" "\n" "Options:\n" -" -n Sort by read names\n" +" -n Input files are sorted by read name\n" " -r Attach RG tag (inferred from file names)\n" " -u Uncompressed BAM output\n" " -f Overwrite the output BAM if exist\n" @@ -1541,29 +1619,40 @@ typedef struct { bam1_p *buf; const bam_hdr_t *h; int index; + int error; } worker_t; -static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) +// Returns 0 for success +// -1 for failure +static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) { size_t i; samFile* fp; fp = sam_open_format(fn, mode, fmt); - if (fp == NULL) return; - sam_hdr_write(fp, h); + if (fp == NULL) return -1; + if (sam_hdr_write(fp, h) != 0) goto fail; if (n_threads > 1) hts_set_threads(fp, n_threads); - for (i = 0; i < l; ++i) - sam_write1(fp, h, buf[i]); + for (i = 0; i < l; ++i) { + if (sam_write1(fp, h, buf[i]) < 0) goto fail; + } + if (sam_close(fp) < 0) return -1; + return 0; + fail: sam_close(fp); + return -1; } static void *worker(void *data) { worker_t *w = (worker_t*)data; char *name; + w->error = 0; ks_mergesort(sort, w->buf_len, w->buf, 0); name = (char*)calloc(strlen(w->prefix) + 20, 1); + if (!name) { w->error = errno; return 0; } sprintf(name, "%s.%.4d.bam", w->prefix, w->index); - write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0, NULL); + if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) + w->error = errno; // Consider using CRAM temporary files if the final output is CRAM. // Typically it is comparable speed while being smaller. 
@@ -1572,7 +1661,8 @@ static void *worker(void *data) // {"no_ref", CRAM_OPT_NO_REF, {1}, NULL} // }; // opt[0].next = &opt[1]; -// write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt); +// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0) +// w->error = errno; free(name); return 0; @@ -1586,6 +1676,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c pthread_t *tid; pthread_attr_t attr; worker_t *w; + int n_failed = 0; if (n_threads < 1) n_threads = 1; if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records @@ -1603,9 +1694,15 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c b += w[i].buf_len; rest -= w[i].buf_len; pthread_create(&tid[i], &attr, worker, &w[i]); } - for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + for (i = 0; i < n_threads; ++i) { + pthread_join(tid[i], 0); + if (w[i].error != 0) { + fprintf(stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error)); + n_failed++; + } + } free(tid); free(w); - return n_files + n_threads; + return (n_failed == 0)? n_files + n_threads : -1; } /*! 
@@ -1675,6 +1772,10 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + if (n_files < 0) { + ret = -1; + goto err; + } mem = k = 0; } } @@ -1687,10 +1788,18 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, // write the final output if (n_files == 0) { // a single block ks_mergesort(sort, k, buf, 0); - write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt); + if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { + fprintf(stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno)); + ret = -1; + goto err; + } } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + if (n_files == -1) { + ret = -1; + goto err; + } fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { @@ -1698,8 +1807,8 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, sprintf(fns[i], "%s.%.4d.bam", prefix, i); } if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, - MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads, - in_fmt, out_fmt) < 0) { + MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO, + NULL, n_threads, in_fmt, out_fmt) < 0) { // Propagate bam_merge_core2() failure; it has already emitted a // message explaining the failure, so no further message is needed. 
goto err; @@ -1754,6 +1863,7 @@ int bam_sort(int argc, char *argv[]) int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; + struct stat st; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -1804,8 +1914,15 @@ int bam_sort(int argc, char *argv[]) sam_open_mode(modeout+1, fnout, NULL); if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9); - if (tmpprefix.l == 0) - ksprintf(&tmpprefix, "%s.tmp", (nargs > 0)? argv[optind] : "STDIN"); + if (tmpprefix.l == 0) { + if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout); + else kputc('.', &tmpprefix); + } + if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) { + unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); + if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix); + ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); + } ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-", tmpprefix.s, fnout, modeout, max_mem, n_threads, diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index d486beb..b2b625d 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -2,7 +2,7 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2015 Genome Research Ltd. + Copyright (C) 2008-2016 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -26,6 +26,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -33,6 +35,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include @@ -406,7 +409,7 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, hdr_match_t *new_sq_matches = NULL; char *text; hdr_match_t matches[2]; - int32_t i, missing; + int32_t i; int32_t old_n_targets = merged_hdr->n_targets; khiter_t iter; int min_tid = -1; @@ -483,7 +486,7 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, if (iter == kh_end(sq_tids)) { // Warn about this, but it's not really fatal. - fprintf(pysamerr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", + fprintf(pysam_stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", __func__, (int) (matches[1].rm_eo - matches[1].rm_so), text + matches[1].rm_so); @@ -504,20 +507,20 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, text += matches[0].rm_eo; } - // Check if any new targets have been missed - missing = 0; + // Copy the @SQ headers found and recreate any missing from binary header. for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { if (new_sq_matches[i].rm_so >= 0) { if (match_to_ks(translate->text, &new_sq_matches[i], out_text)) goto memfail; if (kputc('\n', out_text) == EOF) goto memfail; } else { - fprintf(pysamerr, "[E::%s] @SQ SN (%s) found in binary header but not text header.\n", - __func__, merged_hdr->target_name[i + old_n_targets]); - missing++; + if (kputs("@SQ\tSN:", out_text) == EOF || + kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF || + kputs("\tLN:", out_text) == EOF || + kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF || + kputc('\n', out_text) == EOF) goto memfail; } } - if (missing) goto fail; free(new_sq_matches); return 0; @@ -720,7 +723,7 @@ static int finish_rg_pg(bool is_rg, klist_t(hdrln) *hdr_lines, idx = kh_get(c2c, pg_map, id); if (idx == kh_end(pg_map)) { // Not found, warn. 
- fprintf(pysamerr, "[W::%s] Tag %s%s not found in @PG records\n", + fprintf(pysam_stderr, "[W::%s] Tag %s%s not found in @PG records\n", __func__, search + 1, id); } else { // Remember new id and splice points on original string @@ -777,7 +780,7 @@ static int finish_rg_pg(bool is_rg, klist_t(hdrln) *hdr_lines, static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg, - char* rg_override) + bool copy_co, char* rg_override) { klist_t(hdrln) *rg_list = NULL; klist_t(hdrln) *pg_list = NULL; @@ -819,20 +822,22 @@ static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, kl_destroy(hdrln, rg_list); rg_list = NULL; kl_destroy(hdrln, pg_list); pg_list = NULL; - // Just append @CO headers without translation - const char *line, *end_pointer; - for (line = translate->text; *line; line = end_pointer + 1) { - end_pointer = strchr(line, '\n'); - if (strncmp(line, "@CO", 3) == 0) { - if (end_pointer) { - if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) - goto memfail; - } else { // Last line with no trailing '\n' - if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; - if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; + if (copy_co) { + // Just append @CO headers without translation + const char *line, *end_pointer; + for (line = translate->text; *line; line = end_pointer + 1) { + end_pointer = strchr(line, '\n'); + if (strncmp(line, "@CO", 3) == 0) { + if (end_pointer) { + if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) + goto memfail; + } else { // Last line with no trailing '\n' + if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; + if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; + } } + if (end_pointer == NULL) break; } - if (end_pointer == NULL) break; } return 0; @@ -869,7 +874,7 @@ static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) { + ks_len(&merged_hdr->out_pg) + 
ks_len(&merged_hdr->out_co)); if (txt_sz >= INT32_MAX) { - fprintf(pysamerr, "[%s] Output header text too long\n", __func__); + fprintf(pysam_stderr, "[%s] Output header text too long\n", __func__); return NULL; } @@ -986,7 +991,7 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl) } } else { char *tmp = strdup(decoded_rg); - fprintf(pysamerr, + fprintf(pysam_stderr, "[bam_translate] RG tag \"%s\" on read \"%s\" encountered " "with no corresponding entry in header, tag lost. " "Unknown tags are only reported once per input file for " @@ -1016,7 +1021,7 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl) } } else { char *tmp = strdup(decoded_pg); - fprintf(pysamerr, + fprintf(pysam_stderr, "[bam_translate] PG tag \"%s\" on read \"%s\" encountered " "with no corresponding entry in header, tag lost. " "Unknown tags are only reported once per input file for " @@ -1038,6 +1043,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) // Create reverse translation table for tids int* rtrans = (int*)malloc(sizeof(int32_t)*n*n_targets); const int32_t NOTID = INT32_MIN; + if (!rtrans) return NULL; memset_pattern4((void*)rtrans, &NOTID, sizeof(int32_t)*n*n_targets); int i; for (i = 0; i < n; ++i) { @@ -1058,6 +1064,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) #define MERGE_FORCE 8 // Overwrite output BAM if it exists #define MERGE_COMBINE_RG 16 // Combine RG tags frather than redefining them #define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them +#define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only) /* * How merging is handled @@ -1103,8 +1110,8 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *reg, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt) { - samFile *fpout, **fp; - heap1_t *heap; + samFile *fpout, **fp = NULL; + heap1_t *heap = NULL; bam_hdr_t *hout = NULL; bam_hdr_t *hin = NULL; int i, j, *RG_len = NULL; @@ 
-1113,6 +1120,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; + int *rtrans = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; @@ -1121,28 +1129,44 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { const char *message = strerror(errno); - fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); + fprintf(pysam_stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hin = sam_hdr_read(fpheaders); sam_close(fpheaders); if (hin == NULL) { - fprintf(pysamerr, "[bam_merge_core] couldn't read headers for '%s'\n", + fprintf(pysam_stderr, "[bam_merge_core] couldn't read headers for '%s'\n", headers); - return -1; + goto mem_fail; + } + } else { + hout = bam_hdr_init(); + if (!hout) { + fprintf(pysam_stderr, "[bam_merge_core] couldn't allocate bam header\n"); + goto mem_fail; } + hout->text = strdup(""); + if (!hout->text) goto mem_fail; } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); + if (!fp) goto mem_fail; heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + if (!heap) goto mem_fail; iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); + if (!iter) goto mem_fail; hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); + if (!hdr) goto mem_fail; translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); + if (!translation_tbl) goto mem_fail; RG = (char**)calloc(n, sizeof(char*)); + if (!RG) goto mem_fail; + // prepare RG tag from file names if (flag & MERGE_RG) { RG_len = (int*)calloc(n, sizeof(int)); + if (!RG_len) goto mem_fail; for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; @@ -1151,6 +1175,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = 
(char*)calloc(l + 1, 1); + if (!RG[i]) goto mem_fail; RG_len[i] = l; strncpy(RG[i], s + j, l); } @@ -1161,7 +1186,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, trans_tbl_t dummy; int res; res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG, - flag & MERGE_COMBINE_PG, NULL); + flag & MERGE_COMBINE_PG, true, NULL); trans_tbl_destroy(&dummy); if (res) return -1; // FIXME: memory leak } @@ -1171,31 +1196,19 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_hdr_t *hin; fp[i] = sam_open_format(fn[i], "r", in_fmt); if (fp[i] == NULL) { - int j; - fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]); - for (j = 0; j < i; ++j) { - bam_hdr_destroy(hdr[i]); - sam_close(fp[j]); - } - free(fp); free(heap); - // FIXME: possible memory leak - return -1; + fprintf(pysam_stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); + goto fail; } hin = sam_hdr_read(fp[i]); if (hin == NULL) { - fprintf(pysamerr, "[bam_merge_core] failed to read header for '%s'\n", + fprintf(pysam_stderr, "[bam_merge_core] failed to read header for '%s'\n", fn[i]); - for (j = 0; j < i; ++j) { - bam_hdr_destroy(hdr[i]); - sam_close(fp[j]); - } - free(fp); free(heap); - // FIXME: possible memory leak - return -1; + goto fail; } if (trans_tbl_init(merged_hdr, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG, + (flag & MERGE_FIRST_CO)? (i == 0) : true, RG[i])) return -1; // FIXME: memory leak @@ -1205,13 +1218,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { - fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); + fprintf(pysam_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } } // Did we get an @HD line? 
if (!merged_hdr->have_hd) { - fprintf(pysamerr, "[W::%s] No @HD tag found.\n", __func__); + fprintf(pysam_stderr, "[W::%s] No @HD tag found.\n", __func__); /* FIXME: Should we add an @HD line here, and if so what should we put in it? Ideally we want a way of getting htslib to tell us the SAM version number to assume given no @HD line. Is @@ -1226,12 +1239,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, // If we're only merging a specified region move our iters to start at that point if (reg) { - int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); - int tid, beg, end; - const char *name_lim = hts_parse_reg(reg, &beg, &end); + const char *name_lim; + + rtrans = rtrans_build(n, hout->n_targets, translation_tbl); + if (!rtrans) goto mem_fail; + + name_lim = hts_parse_reg(reg, &beg, &end); if (name_lim) { char *name = malloc(name_lim - reg + 1); + if (!name) goto mem_fail; memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); @@ -1244,18 +1261,18 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, end = INT_MAX; } if (tid < 0) { - if (name_lim) fprintf(pysamerr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); - else fprintf(pysamerr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); - return -1; + if (name_lim) fprintf(pysam_stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); + else fprintf(pysam_stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); + goto fail; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (idx == NULL) { - fprintf(pysamerr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", + fprintf(pysam_stderr, "[%s] failed to load index for %s. 
Random alignment retrieval only works for indexed BAM or CRAM files.\n", __func__, fn[i]); - return -1; + goto fail; } if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); @@ -1263,47 +1280,70 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); - if (iter[i] == NULL) break; + if (iter[i] == NULL) { + if (mapped_tid != INT32_MIN) { + fprintf(pysam_stderr, + "[%s] failed to get iterator over " + "{%s, %d, %d, %d}\n", + __func__, fn[i], mapped_tid, beg, end); + } else { + fprintf(pysam_stderr, + "[%s] failed to get iterator over " + "{%s, HTS_IDX_NONE, 0, 0}\n", + __func__, fn[i]); + } + goto fail; + } } free(rtrans); + rtrans = NULL; } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); - if (iter[i] == NULL) break; + if (iter[i] == NULL) { + fprintf(pysam_stderr, "[%s] failed to get iterator\n", __func__); + goto fail; + } } else iter[i] = NULL; } } - if (i < n) { - fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); - return -1; - } - // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; + int res; h->i = i; h->b = bam_init1(); - if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { + if (!h->b) goto mem_fail; + res = iter[i] ? 
sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b); + if (res >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } - else { + else if (res == -1 && (!iter[i] || iter[i]->finished)) { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; + } else { + fprintf(pysam_stderr, "[%s] failed to read first record from %s\n", + __func__, fn[i]); + goto fail; } } // Open output file and write header if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) { - fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__); + fprintf(pysam_stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno)); + return -1; + } + if (sam_hdr_write(fpout, hout) != 0) { + fprintf(pysam_stderr, "[%s] failed to write header.\n", __func__); + sam_close(fpout); return -1; } - sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge @@ -1315,16 +1355,24 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } - sam_write1(fpout, hout, b); + if (sam_write1(fpout, hout, b) < 0) { + fprintf(pysam_stderr, "[%s] failed to write to output file.\n", __func__); + sam_close(fpout); + return -1; + } if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; - } else if (j == -1) { + } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; - } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. 
Continue anyway.\n", fn[heap->i]); + } else { + fprintf(pysam_stderr, "[bam_merge_core] error: '%s' is truncated.\n", + fn[heap->i]); + goto fail; + } ks_heapadjust(heap, 0, n, heap); } @@ -1342,9 +1390,39 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, bam_hdr_destroy(hin); bam_hdr_destroy(hout); free_merged_header(merged_hdr); - sam_close(fpout); free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); + if (sam_close(fpout) < 0) { + fprintf(pysam_stderr, "[bam_merge_core] error closing output file\n"); + return -1; + } return 0; + + mem_fail: + fprintf(pysam_stderr, "[bam_merge_core] Out of memory\n"); + + fail: + if (flag & MERGE_RG) { + if (RG) { + for (i = 0; i != n; ++i) free(RG[i]); + } + free(RG_len); + } + for (i = 0; i < n; ++i) { + if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i); + if (iter && iter[i]) hts_itr_destroy(iter[i]); + if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); + if (fp && fp[i]) sam_close(fp[i]); + if (heap && heap[i].b) bam_destroy1(heap[i].b); + } + if (hout) bam_hdr_destroy(hout); + free(RG); + free(translation_tbl); + free(hdr); + free(iter); + free(heap); + free(fp); + free(rtrans); + return -1; } // Unused here but may be used by legacy samtools-using third-party code @@ -1363,7 +1441,7 @@ static void merge_usage(FILE *to) "Usage: samtools merge [-nurlf] [-h inh.sam] [-b ] [ ... 
]\n" "\n" "Options:\n" -" -n Sort by read names\n" +" -n Input files are sorted by read name\n" " -r Attach RG tag (inferred from file names)\n" " -u Uncompressed BAM output\n" " -f Overwrite the output BAM if exist\n" @@ -1396,7 +1474,7 @@ int bam_merge(int argc, char *argv[]) }; if (argc == 1) { - merge_usage(stdout); + merge_usage(pysam_stdout); return 0; } @@ -1426,7 +1504,7 @@ int bam_merge(int argc, char *argv[]) fn_size += nfiles; } else { - fprintf(pysamerr, "[%s] Invalid file list \"%s\"\n", __func__, optarg); + fprintf(pysam_stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg); ret = 1; } break; @@ -1434,12 +1512,12 @@ int bam_merge(int argc, char *argv[]) default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': merge_usage(pysamerr); return 1; + case '?': merge_usage(pysam_stderr); return 1; } } if ( argc - optind < 1 ) { - fprintf(pysamerr, "You must at least specify the output file.\n"); - merge_usage(pysamerr); + fprintf(pysam_stderr, "You must at least specify the output file.\n"); + merge_usage(pysam_stderr); return 1; } @@ -1448,7 +1526,7 @@ int bam_merge(int argc, char *argv[]) FILE *fp = fopen(argv[optind], "rb"); if (fp != NULL) { fclose(fp); - fprintf(pysamerr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); + fprintf(pysam_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. 
Abort.\n", __func__, argv[optind]); return 1; } } @@ -1461,8 +1539,8 @@ int bam_merge(int argc, char *argv[]) memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); } if (fn_size+nargcfiles < 1) { - fprintf(pysamerr, "You must specify at least one (and usually two or more) input files.\n"); - merge_usage(pysamerr); + fprintf(pysam_stderr, "You must specify at least one (and usually two or more) input files.\n"); + merge_usage(pysam_stderr); return 1; } strcpy(mode, "wb"); @@ -1543,29 +1621,40 @@ typedef struct { bam1_p *buf; const bam_hdr_t *h; int index; + int error; } worker_t; -static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) +// Returns 0 for success +// -1 for failure +static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) { size_t i; samFile* fp; fp = sam_open_format(fn, mode, fmt); - if (fp == NULL) return; - sam_hdr_write(fp, h); + if (fp == NULL) return -1; + if (sam_hdr_write(fp, h) != 0) goto fail; if (n_threads > 1) hts_set_threads(fp, n_threads); - for (i = 0; i < l; ++i) - sam_write1(fp, h, buf[i]); + for (i = 0; i < l; ++i) { + if (sam_write1(fp, h, buf[i]) < 0) goto fail; + } + if (sam_close(fp) < 0) return -1; + return 0; + fail: sam_close(fp); + return -1; } static void *worker(void *data) { worker_t *w = (worker_t*)data; char *name; + w->error = 0; ks_mergesort(sort, w->buf_len, w->buf, 0); name = (char*)calloc(strlen(w->prefix) + 20, 1); + if (!name) { w->error = errno; return 0; } sprintf(name, "%s.%.4d.bam", w->prefix, w->index); - write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0, NULL); + if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) + w->error = errno; // Consider using CRAM temporary files if the final output is CRAM. // Typically it is comparable speed while being smaller. 
@@ -1574,7 +1663,8 @@ static void *worker(void *data) // {"no_ref", CRAM_OPT_NO_REF, {1}, NULL} // }; // opt[0].next = &opt[1]; -// write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt); +// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0) +// w->error = errno; free(name); return 0; @@ -1588,6 +1678,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c pthread_t *tid; pthread_attr_t attr; worker_t *w; + int n_failed = 0; if (n_threads < 1) n_threads = 1; if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records @@ -1605,9 +1696,15 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c b += w[i].buf_len; rest -= w[i].buf_len; pthread_create(&tid[i], &attr, worker, &w[i]); } - for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + for (i = 0; i < n_threads; ++i) { + pthread_join(tid[i], 0); + if (w[i].error != 0) { + fprintf(pysam_stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error)); + n_failed++; + } + } free(tid); free(w); - return n_files + n_threads; + return (n_failed == 0)? n_files + n_threads : -1; } /*! 
@@ -1647,12 +1744,12 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, fp = sam_open_format(fn, "r", in_fmt); if (fp == NULL) { const char *message = strerror(errno); - fprintf(pysamerr, "[bam_sort_core] fail to open '%s': %s\n", fn, message); + fprintf(pysam_stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message); return -2; } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(pysamerr, "[bam_sort_core] failed to read header for '%s'\n", fn); + fprintf(pysam_stderr, "[bam_sort_core] failed to read header for '%s'\n", fn); goto err; } if (is_by_qname) change_SO(header, "queryname"); @@ -1677,11 +1774,15 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + if (n_files < 0) { + ret = -1; + goto err; + } mem = k = 0; } } if (ret != -1) { - fprintf(pysamerr, "[bam_sort_core] truncated file. Aborting.\n"); + fprintf(pysam_stderr, "[bam_sort_core] truncated file. 
Aborting.\n"); ret = -1; goto err; } @@ -1689,19 +1790,27 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, // write the final output if (n_files == 0) { // a single block ks_mergesort(sort, k, buf, 0); - write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt); + if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { + fprintf(pysam_stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno)); + ret = -1; + goto err; + } } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); - fprintf(pysamerr, "[bam_sort_core] merging from %d files...\n", n_files); + if (n_files == -1) { + ret = -1; + goto err; + } + fprintf(pysam_stderr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, - MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads, - in_fmt, out_fmt) < 0) { + MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO, + NULL, n_threads, in_fmt, out_fmt) < 0) { // Propagate bam_merge_core2() failure; it has already emitted a // message explaining the failure, so no further message is needed. 
goto err; @@ -1756,6 +1865,7 @@ int bam_sort(int argc, char *argv[]) int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; + struct stat st; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -1782,22 +1892,22 @@ int bam_sort(int argc, char *argv[]) default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': sort_usage(pysamerr); ret = EXIT_FAILURE; goto sort_end; + case '?': sort_usage(pysam_stderr); ret = EXIT_FAILURE; goto sort_end; } } nargs = argc - optind; if (nargs == 0 && isatty(STDIN_FILENO)) { - sort_usage(stdout); + sort_usage(pysam_stdout); ret = EXIT_SUCCESS; goto sort_end; } else if (nargs >= 2) { // If exactly two, user probably tried to specify legacy if (nargs == 2) - fprintf(pysamerr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n"); + fprintf(pysam_stderr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n"); - sort_usage(pysamerr); + sort_usage(pysam_stderr); ret = EXIT_FAILURE; goto sort_end; } @@ -1806,8 +1916,15 @@ int bam_sort(int argc, char *argv[]) sam_open_mode(modeout+1, fnout, NULL); if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9); - if (tmpprefix.l == 0) - ksprintf(&tmpprefix, "%s.tmp", (nargs > 0)? argv[optind] : "STDIN"); + if (tmpprefix.l == 0) { + if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout); + else kputc('.', &tmpprefix); + } + if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) { + unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); + if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix); + ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); + } ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? 
argv[optind] : "-", tmpprefix.s, fnout, modeout, max_mem, n_threads, @@ -1819,7 +1936,7 @@ int bam_sort(int argc, char *argv[]) // If we failed on opening the input file & it has no .bam/.cram/etc // extension, the user probably tried legacy -o if (ret == -2 && o_seen && nargs > 0 && sam_open_mode(dummy, argv[optind], NULL) < 0) - fprintf(pysamerr, "[bam_sort] Note the argument has been replaced by -T/-o options\n"); + fprintf(pysam_stderr, "[bam_sort] Note the argument has been replaced by -T/-o options\n"); ret = EXIT_FAILURE; } diff --git a/samtools/bam_split.c b/samtools/bam_split.c index e44acc0..9a2998a 100644 --- a/samtools/bam_split.c +++ b/samtools/bam_split.c @@ -1,6 +1,6 @@ /* bam_split.c -- split subcommand. - Copyright (C) 2013, 2014 Genome Research Ltd. + Copyright (C) 2013-2015 Genome Research Ltd. Author: Martin Pollard @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -55,6 +57,7 @@ struct state { bam_hdr_t* unaccounted_header; size_t output_count; char** rg_id; + char **rg_output_file_name; samFile** rg_output_file; bam_hdr_t** rg_output_header; kh_c2i_t* rg_hash; @@ -62,7 +65,7 @@ struct state { typedef struct state state_t; -static int cleanup_state(state_t* status); +static int cleanup_state(state_t* status, bool check_close); static void cleanup_opts(parsed_opts_t* opts); static void usage(FILE *write_to) @@ -334,7 +337,7 @@ static state_t* init(parsed_opts_t* opts) if (retval->merged_input_header == NULL) { fprintf(stderr, "Could not read header for file '%s'\n", opts->merged_input_name); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } @@ -343,14 +346,14 @@ static state_t* init(parsed_opts_t* opts) samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in); if (!hdr_load) { fprintf(stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } retval->unaccounted_header = sam_hdr_read(hdr_load); if (retval->unaccounted_header == NULL) { fprintf(stderr, "Could not read header for file '%s'\n", opts->unaccounted_header_name); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } sam_close(hdr_load); @@ -361,7 +364,7 @@ static state_t* init(parsed_opts_t* opts) retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); if (retval->unaccounted_file == NULL) { fprintf(stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } } @@ -370,12 +373,13 @@ static state_t* init(parsed_opts_t* opts) if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count); + retval->rg_output_file_name = (char 
**)calloc(retval->output_count, sizeof(char *)); retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); retval->rg_hash = kh_init_c2i(); - if (!retval->rg_output_file || !retval->rg_output_header) { + if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { fprintf(stderr, "Could not allocate memory for output file array. Out of memory?"); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } @@ -383,7 +387,7 @@ static state_t* init(parsed_opts_t* opts) char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name); if (!input_base_name) { fprintf(stderr, "Out of memory\n"); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } char* extension = strrchr(input_base_name, '.'); @@ -399,16 +403,17 @@ static state_t* init(parsed_opts_t* opts) &opts->ga.out); if ( output_filename == NULL ) { - fprintf(stderr, "Error expanding output filename format string.\r\n"); - cleanup_state(retval); + fprintf(stderr, "Error expanding output filename format string.\n"); + cleanup_state(retval, false); free(input_base_name); return NULL; } + retval->rg_output_file_name[i] = output_filename; retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out); if (retval->rg_output_file[i] == NULL) { - fprintf(stderr, "Could not open output file: %s\r\n", output_filename); - cleanup_state(retval); + fprintf(stderr, "Could not open output file: %s\n", output_filename); + cleanup_state(retval, false); free(input_base_name); return NULL; } @@ -421,13 +426,11 @@ static state_t* init(parsed_opts_t* opts) // Set and edit header retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) { - fprintf(stderr, "Could not rewrite header for file: %s\r\n", output_filename); - 
cleanup_state(retval); - free(output_filename); + fprintf(stderr, "Could not rewrite header for file: %s\n", output_filename); + cleanup_state(retval, false); free(input_base_name); return NULL; } - free(output_filename); } free(input_base_name); @@ -444,7 +447,8 @@ static bool split(state_t* state) size_t i; for (i = 0; i < state->output_count; i++) { if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) { - fprintf(stderr, "Could not write output file header\n"); + fprintf(stderr, "Could not write output file header for '%s'\n", + state->rg_output_file_name[i]); return false; } } @@ -457,7 +461,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(stderr, "Could not write read sequence\n"); + fprintf(stderr, "Could not read first input record\n"); return false; } } @@ -478,7 +482,9 @@ static bool split(state_t* state) // if found write to the appropriate untangled bam int i = kh_val(state->rg_hash,iter); if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) { - fprintf(stderr, "Could not write sequence\n"); + fprintf(stderr, "Could not write to output file '%s'\n", + state->rg_output_file_name[i]); + bam_destroy1(file_read); return false; } } else { @@ -493,7 +499,8 @@ static bool split(state_t* state) return false; } else { if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) { - fprintf(stderr, "Could not write sequence\n"); + fprintf(stderr, "Could not write to unaccounted output file\n"); + bam_destroy1(file_read); return false; } } @@ -505,7 +512,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(stderr, "Could not write read sequence\n"); + fprintf(stderr, "Could not read input record\n"); return false; } } @@ -514,23 +521,38 @@ static bool split(state_t* state) return true; } -static int cleanup_state(state_t* status) +static int cleanup_state(state_t* status, bool 
check_close) { int ret = 0; if (!status) return 0; if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header); - if (status->unaccounted_file) ret |= sam_close(status->unaccounted_file); + if (status->unaccounted_file) { + if (sam_close(status->unaccounted_file) < 0 && check_close) { + fprintf(stderr, "Error on closing unaccounted file\n"); + ret = -1; + } + } sam_close(status->merged_input_file); size_t i; for (i = 0; i < status->output_count; i++) { - bam_hdr_destroy(status->rg_output_header[i]); - ret |= sam_close(status->rg_output_file[i]); - free(status->rg_id[i]); + if (status->rg_output_header && status->rg_output_header[i]) + bam_hdr_destroy(status->rg_output_header[i]); + if (status->rg_output_file && status->rg_output_file[i]) { + if (sam_close(status->rg_output_file[i]) < 0 && check_close) { + fprintf(stderr, "Error on closing output file '%s'\n", + status->rg_output_file_name[i]); + ret = -1; + } + } + if (status->rg_id) free(status->rg_id[i]); + if (status->rg_output_file_name) free(status->rg_output_file_name[i]); } - bam_hdr_destroy(status->merged_input_header); + if (status->merged_input_header) + bam_hdr_destroy(status->merged_input_header); free(status->rg_output_header); free(status->rg_output_file); + free(status->rg_output_file_name); kh_destroy_c2i(status->rg_hash); free(status->rg_id); free(status); @@ -553,13 +575,17 @@ int main_split(int argc, char** argv) { int ret = 1; parsed_opts_t* opts = parse_args(argc, argv); - if (!opts ) goto cleanup_opts; + if (!opts) goto cleanup_opts; state_t* status = init(opts); if (!status) goto cleanup_opts; - if (split(status)) ret = 0; + if (!split(status)) { + cleanup_state(status, false); + goto cleanup_opts; + } + + ret = cleanup_state(status, true); - ret |= (cleanup_state(status) != 0); cleanup_opts: cleanup_opts(opts); diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c index 329556f..2348f48 100644 --- a/samtools/bam_split.c.pysam.c +++ 
b/samtools/bam_split.c.pysam.c @@ -2,7 +2,7 @@ /* bam_split.c -- split subcommand. - Copyright (C) 2013, 2014 Genome Research Ltd. + Copyright (C) 2013-2015 Genome Research Ltd. Author: Martin Pollard @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -57,6 +59,7 @@ struct state { bam_hdr_t* unaccounted_header; size_t output_count; char** rg_id; + char **rg_output_file_name; samFile** rg_output_file; bam_hdr_t** rg_output_header; kh_c2i_t* rg_hash; @@ -64,7 +67,7 @@ struct state { typedef struct state state_t; -static int cleanup_state(state_t* status); +static int cleanup_state(state_t* status, bool check_close); static void cleanup_opts(parsed_opts_t* opts); static void usage(FILE *write_to) @@ -92,7 +95,7 @@ static void usage(FILE *write_to) // Takes the command line options and turns them into something we can understand static parsed_opts_t* parse_args(int argc, char** argv) { - if (argc == 1) { usage(stdout); return NULL; } + if (argc == 1) { usage(pysam_stdout); return NULL; } const char* optstring = "vf:u:"; char* delim; @@ -130,7 +133,7 @@ static parsed_opts_t* parse_args(int argc, char** argv) if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break; /* else fall-through */ case '?': - usage(stdout); + usage(pysam_stdout); free(retval); return NULL; } @@ -142,8 +145,8 @@ static parsed_opts_t* parse_args(int argc, char** argv) argv += optind; if (argc != 1) { - fprintf(pysamerr, "Invalid number of arguments: %d\n", argc); - usage(pysamerr); + fprintf(pysam_stderr, "Invalid number of arguments: %d\n", argc); + usage(pysam_stderr); free(retval); return NULL; } @@ -184,11 +187,11 @@ static char* expand_format_string(const char* format_string, const char* basenam kputs("bam", &str); break; case '\0': - // Error is: fprintf(pysamerr, "bad format string, trailing %%\n"); + 
// Error is: fprintf(pysam_stderr, "bad format string, trailing %%\n"); free(str.s); return NULL; default: - // Error is: fprintf(pysamerr, "bad format string, unknown format specifier\n"); + // Error is: fprintf(pysam_stderr, "bad format string, unknown format specifier\n"); free(str.s); return NULL; } @@ -322,21 +325,21 @@ static state_t* init(parsed_opts_t* opts) { state_t* retval = calloc(sizeof(state_t), 1); if (!retval) { - fprintf(pysamerr, "Out of memory"); + fprintf(pysam_stderr, "Out of memory"); return NULL; } retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); if (!retval->merged_input_file) { - fprintf(pysamerr, "Could not open input file (%s)\n", opts->merged_input_name); + fprintf(pysam_stderr, "Could not open input file (%s)\n", opts->merged_input_name); free(retval); return NULL; } retval->merged_input_header = sam_hdr_read(retval->merged_input_file); if (retval->merged_input_header == NULL) { - fprintf(pysamerr, "Could not read header for file '%s'\n", + fprintf(pysam_stderr, "Could not read header for file '%s'\n", opts->merged_input_name); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } @@ -344,15 +347,15 @@ static state_t* init(parsed_opts_t* opts) if (opts->unaccounted_header_name) { samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in); if (!hdr_load) { - fprintf(pysamerr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name); - cleanup_state(retval); + fprintf(pysam_stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name); + cleanup_state(retval, false); return NULL; } retval->unaccounted_header = sam_hdr_read(hdr_load); if (retval->unaccounted_header == NULL) { - fprintf(pysamerr, "Could not read header for file '%s'\n", + fprintf(pysam_stderr, "Could not read header for file '%s'\n", opts->unaccounted_header_name); - cleanup_state(retval); + cleanup_state(retval, false); return NULL; } 
sam_close(hdr_load); @@ -362,30 +365,31 @@ static state_t* init(parsed_opts_t* opts) retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); if (retval->unaccounted_file == NULL) { - fprintf(pysamerr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name); - cleanup_state(retval); + fprintf(pysam_stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name); + cleanup_state(retval, false); return NULL; } } // Open output files for RGs if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; - if (opts->verbose) fprintf(pysamerr, "@RG's found %zu\n",retval->output_count); + if (opts->verbose) fprintf(pysam_stderr, "@RG's found %zu\n",retval->output_count); + retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *)); retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); retval->rg_hash = kh_init_c2i(); - if (!retval->rg_output_file || !retval->rg_output_header) { - fprintf(pysamerr, "Could not allocate memory for output file array. Out of memory?"); - cleanup_state(retval); + if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { + fprintf(pysam_stderr, "Could not allocate memory for output file array. Out of memory?"); + cleanup_state(retval, false); return NULL; } char* dirsep = strrchr(opts->merged_input_name, '/'); char* input_base_name = strdup(dirsep? 
dirsep+1 : opts->merged_input_name); if (!input_base_name) { - fprintf(pysamerr, "Out of memory\n"); - cleanup_state(retval); + fprintf(pysam_stderr, "Out of memory\n"); + cleanup_state(retval, false); return NULL; } char* extension = strrchr(input_base_name, '.'); @@ -401,16 +405,17 @@ static state_t* init(parsed_opts_t* opts) &opts->ga.out); if ( output_filename == NULL ) { - fprintf(pysamerr, "Error expanding output filename format string.\r\n"); - cleanup_state(retval); + fprintf(pysam_stderr, "Error expanding output filename format string.\n"); + cleanup_state(retval, false); free(input_base_name); return NULL; } + retval->rg_output_file_name[i] = output_filename; retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out); if (retval->rg_output_file[i] == NULL) { - fprintf(pysamerr, "Could not open output file: %s\r\n", output_filename); - cleanup_state(retval); + fprintf(pysam_stderr, "Could not open output file: %s\n", output_filename); + cleanup_state(retval, false); free(input_base_name); return NULL; } @@ -423,13 +428,11 @@ static state_t* init(parsed_opts_t* opts) // Set and edit header retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) { - fprintf(pysamerr, "Could not rewrite header for file: %s\r\n", output_filename); - cleanup_state(retval); - free(output_filename); + fprintf(pysam_stderr, "Could not rewrite header for file: %s\n", output_filename); + cleanup_state(retval, false); free(input_base_name); return NULL; } - free(output_filename); } free(input_base_name); @@ -440,13 +443,14 @@ static state_t* init(parsed_opts_t* opts) static bool split(state_t* state) { if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) { - fprintf(pysamerr, "Could not write output file header\n"); + fprintf(pysam_stderr, "Could not write output file header\n"); return false; } size_t i; for (i = 0; i < 
state->output_count; i++) { if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) { - fprintf(pysamerr, "Could not write output file header\n"); + fprintf(pysam_stderr, "Could not write output file header for '%s'\n", + state->rg_output_file_name[i]); return false; } } @@ -459,7 +463,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(pysamerr, "Could not write read sequence\n"); + fprintf(pysam_stderr, "Could not read first input record\n"); return false; } } @@ -480,22 +484,25 @@ static bool split(state_t* state) // if found write to the appropriate untangled bam int i = kh_val(state->rg_hash,iter); if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) { - fprintf(pysamerr, "Could not write sequence\n"); + fprintf(pysam_stderr, "Could not write to output file '%s'\n", + state->rg_output_file_name[i]); + bam_destroy1(file_read); return false; } } else { // otherwise write to the unaccounted bam if there is one or fail if (state->unaccounted_file == NULL) { if (tag) { - fprintf(pysamerr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag)); + fprintf(pysam_stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag)); } else { - fprintf(pysamerr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read)); + fprintf(pysam_stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read)); } bam_destroy1(file_read); return false; } else { if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) { - fprintf(pysamerr, "Could not write sequence\n"); + fprintf(pysam_stderr, "Could not write to unaccounted output file\n"); + bam_destroy1(file_read); return false; } } @@ -507,7 +514,7 @@ static bool split(state_t* state) bam_destroy1(file_read); file_read = NULL; if (r < -1) { - fprintf(pysamerr, "Could not write read sequence\n"); + fprintf(pysam_stderr, "Could not 
read input record\n"); return false; } } @@ -516,23 +523,38 @@ static bool split(state_t* state) return true; } -static int cleanup_state(state_t* status) +static int cleanup_state(state_t* status, bool check_close) { int ret = 0; if (!status) return 0; if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header); - if (status->unaccounted_file) ret |= sam_close(status->unaccounted_file); + if (status->unaccounted_file) { + if (sam_close(status->unaccounted_file) < 0 && check_close) { + fprintf(pysam_stderr, "Error on closing unaccounted file\n"); + ret = -1; + } + } sam_close(status->merged_input_file); size_t i; for (i = 0; i < status->output_count; i++) { - bam_hdr_destroy(status->rg_output_header[i]); - ret |= sam_close(status->rg_output_file[i]); - free(status->rg_id[i]); + if (status->rg_output_header && status->rg_output_header[i]) + bam_hdr_destroy(status->rg_output_header[i]); + if (status->rg_output_file && status->rg_output_file[i]) { + if (sam_close(status->rg_output_file[i]) < 0 && check_close) { + fprintf(pysam_stderr, "Error on closing output file '%s'\n", + status->rg_output_file_name[i]); + ret = -1; + } + } + if (status->rg_id) free(status->rg_id[i]); + if (status->rg_output_file_name) free(status->rg_output_file_name[i]); } - bam_hdr_destroy(status->merged_input_header); + if (status->merged_input_header) + bam_hdr_destroy(status->merged_input_header); free(status->rg_output_header); free(status->rg_output_file); + free(status->rg_output_file_name); kh_destroy_c2i(status->rg_hash); free(status->rg_id); free(status); @@ -555,13 +577,17 @@ int main_split(int argc, char** argv) { int ret = 1; parsed_opts_t* opts = parse_args(argc, argv); - if (!opts ) goto cleanup_opts; + if (!opts) goto cleanup_opts; state_t* status = init(opts); if (!status) goto cleanup_opts; - if (split(status)) ret = 0; + if (!split(status)) { + cleanup_state(status, false); + goto cleanup_opts; + } + + ret = cleanup_state(status, true); - ret |= 
(cleanup_state(status) != 0); cleanup_opts: cleanup_opts(opts); diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c index 5cb3235..f6cf1d5 100644 --- a/samtools/bam_stat.c +++ b/samtools/bam_stat.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c index a519312..cdca4dd 100644 --- a/samtools/bam_stat.c.pysam.c +++ b/samtools/bam_stat.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -81,7 +83,7 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) flagstat_loop(s, c); bam_destroy1(b); if (ret != -1) - fprintf(pysamerr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + fprintf(pysam_stderr, "[bam_flagstat_core] Truncated file? 
Continue anyway.\n"); return s; } @@ -120,16 +122,16 @@ int bam_flagstat(int argc, char *argv[]) switch (c) { case INPUT_FMT_OPTION: if (hts_opt_add(&in_opts, optarg) < 0) - usage_exit(pysamerr, EXIT_FAILURE); + usage_exit(pysam_stderr, EXIT_FAILURE); break; default: - usage_exit(pysamerr, EXIT_FAILURE); + usage_exit(pysam_stderr, EXIT_FAILURE); } } if (argc != optind+1) { - if (argc == optind) usage_exit(stdout, EXIT_SUCCESS); - else usage_exit(pysamerr, EXIT_FAILURE); + if (argc == optind) usage_exit(pysam_stdout, EXIT_SUCCESS); + else usage_exit(pysam_stderr, EXIT_FAILURE); } fp = sam_open(argv[optind], "r"); if (fp == NULL) { @@ -137,40 +139,40 @@ int bam_flagstat(int argc, char *argv[]) return 1; } if (hts_opt_apply(fp, in_opts)) { - fprintf(pysamerr, "Failed to apply input-fmt-options\n"); + fprintf(pysam_stderr, "Failed to apply input-fmt-options\n"); return 1; } if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) { - fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(pysamerr, "Failed to read header for \"%s\"\n", argv[optind]); + fprintf(pysam_stderr, "Failed to read header for \"%s\"\n", argv[optind]); return 1; } s = bam_flagstat_core(fp, header); - printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); - printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); - printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); - printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); - printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), 
percent(b1, s->n_mapped[1], s->n_reads[1])); - printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); - printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); - printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); - printf("%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); - printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); - printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); - printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); - printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); + fprintf(pysam_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + fprintf(pysam_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); + fprintf(pysam_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); + fprintf(pysam_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + fprintf(pysam_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + fprintf(pysam_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); + fprintf(pysam_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); + fprintf(pysam_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); + fprintf(pysam_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); + fprintf(pysam_stdout, "%lld + %lld with itself and mate mapped\n", 
s->n_pair_map[0], s->n_pair_map[1]); + fprintf(pysam_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); + fprintf(pysam_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); + fprintf(pysam_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_hdr_destroy(header); sam_close(fp); diff --git a/samtools/bam_tview.c b/samtools/bam_tview.c index f86ae43..f1f0cc7 100644 --- a/samtools/bam_tview.c +++ b/samtools/bam_tview.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "bam_tview.h" diff --git a/samtools/bam_tview.c.pysam.c b/samtools/bam_tview.c.pysam.c index 736b588..a47bced 100644 --- a/samtools/bam_tview.c.pysam.c +++ b/samtools/bam_tview.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "bam_tview.h" @@ -68,7 +70,7 @@ int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, tv->fp = sam_open_format(fn, "r", fmt); if(tv->fp == NULL) { - fprintf(pysamerr,"sam_open %s. %s\n", fn,fn_fa); + fprintf(pysam_stderr,"sam_open %s. 
%s\n", fn,fn_fa); exit(EXIT_FAILURE); } // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024); @@ -77,13 +79,13 @@ int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, tv->header = sam_hdr_read(tv->fp); if(tv->header == NULL) { - fprintf(pysamerr,"Cannot read '%s'.\n", fn); + fprintf(pysam_stderr,"Cannot read '%s'.\n", fn); exit(EXIT_FAILURE); } tv->idx = sam_index_load(tv->fp, fn); if (tv->idx == NULL) { - fprintf(pysamerr,"Cannot read index for '%s'.\n", fn); + fprintf(pysam_stderr,"Cannot read index for '%s'.\n", fn); exit(EXIT_FAILURE); } tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); @@ -297,7 +299,7 @@ int base_draw_aln(tview_t *tv, int tid, int pos) free(str); if ( !tv->ref ) { - fprintf(pysamerr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n"); + fprintf(pysam_stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n"); exit(1); } } @@ -326,19 +328,19 @@ static void error(const char *format, ...) 
{ if ( !format ) { - fprintf(pysamerr, + fprintf(pysam_stderr, "Usage: samtools tview [options] [ref.fasta]\n" "Options:\n" " -d display output as (H)tml or (C)urses or (T)ext \n" " -p chr:pos go directly to this position\n" " -s STR display only reads from this sample or group\n"); - sam_global_opt_help(pysamerr, "-.--."); + sam_global_opt_help(pysam_stderr, "-.--."); } else { va_list ap; va_start(ap, format); - vfprintf(pysamerr, format, ap); + vfprintf(pysam_stderr, format, ap); va_end(ap); } exit(-1); @@ -428,7 +430,7 @@ int bam_tview_main(int argc, char *argv[]) } if ( i==tv->header->n_targets ) { - fprintf(pysamerr,"None of the BAM sequence names present in the fasta file\n"); + fprintf(pysam_stderr,"None of the BAM sequence names present in the fasta file\n"); exit(EXIT_FAILURE); } tv->curr_tid = i; diff --git a/samtools/bam_tview_curses.c.pysam.c b/samtools/bam_tview_curses.c.pysam.c index bbeedf8..90a8335 100644 --- a/samtools/bam_tview_curses.c.pysam.c +++ b/samtools/bam_tview_curses.c.pysam.c @@ -304,7 +304,7 @@ tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples, tview_t* base=(tview_t*)tv; if(tv==0) { - fprintf(pysamerr,"Calloc failed\n"); + fprintf(pysam_stderr,"Calloc failed\n"); return 0; } diff --git a/samtools/bam_tview_html.c b/samtools/bam_tview_html.c index 9db8fce..e3aecda 100644 --- a/samtools/bam_tview_html.c +++ b/samtools/bam_tview_html.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include "bam_tview.h" diff --git a/samtools/bam_tview_html.c.pysam.c b/samtools/bam_tview_html.c.pysam.c index b42c737..164e33d 100644 --- a/samtools/bam_tview_html.c.pysam.c +++ b/samtools/bam_tview_html.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include "bam_tview.h" @@ -183,7 +185,7 @@ static int html_drawaln(struct AbstractTview* tv, int tid, int pos) fprintf(ptr->out,"1) fprintf(pysamerr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes); + //if(y>1) fprintf(pysam_stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes); if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) { @@ -322,12 +324,12 @@ tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples, tview_t* base=(tview_t*)tv; if(tv==0) { - fprintf(pysamerr,"Calloc failed\n"); + fprintf(pysam_stderr,"Calloc failed\n"); return 0; } tv->row_count=0; tv->screen=NULL; - tv->out=stdout; + tv->out=pysam_stdout; tv->attributes=0; base_tv_init(base,fn,fn_fa,samples,fmt); /* initialize callbacks */ diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c index ac97bb8..044bc4e 100644 --- a/samtools/bamshuf.c +++ b/samtools/bamshuf.c @@ -1,7 +1,7 @@ /* bamshuf.c -- collate subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013, 2015 Genome Research Ltd. Author: Heng Li @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -77,14 +79,16 @@ KSORT_INIT(bamshuf, elem_t, elem_lt) static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout, sam_global_args *ga) { - samFile *fp, *fpw, **fpt; - char **fnt, modew[8]; - bam1_t *b; - int i, l; - bam_hdr_t *h; - int64_t *cnt; + samFile *fp, *fpw = NULL, **fpt = NULL; + char **fnt = NULL, modew[8]; + bam1_t *b = NULL; + int i, l, r; + bam_hdr_t *h = NULL; + int64_t j, max_cnt = 0, *cnt = NULL; + elem_t *a = NULL; - // split + // Read input, distribute reads pseudo-randomly into n_files temporary + // files. fp = sam_open_format(fn, "r", &ga->in); if (fp == NULL) { print_error_errno("collate", "Cannot open input file \"%s\"", fn); @@ -94,39 +98,69 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, h = sam_hdr_read(fp); if (h == NULL) { fprintf(stderr, "Couldn't read header for '%s'\n", fn); - return 1; + goto fail; } fnt = (char**)calloc(n_files, sizeof(char*)); + if (!fnt) goto mem_fail; fpt = (samFile**)calloc(n_files, sizeof(samFile*)); + if (!fpt) goto mem_fail; cnt = (int64_t*)calloc(n_files, 8); + if (!cnt) goto mem_fail; + l = strlen(pre); for (i = 0; i < n_files; ++i) { fnt[i] = (char*)calloc(l + 10, 1); + if (!fnt[i]) goto mem_fail; sprintf(fnt[i], "%s.%.4d.bam", pre, i); fpt[i] = sam_open(fnt[i], "wb1"); if (fpt[i] == NULL) { print_error_errno("collate", "Cannot open intermediate file \"%s\"", fnt[i]); - return 1; + goto fail; + } + if (sam_hdr_write(fpt[i], h) < 0) { + print_error_errno("collate", "Couldn't write header to intermediate file \"%s\"", fnt[i]); + goto fail; } - sam_hdr_write(fpt[i], h); } b = bam_init1(); - while (sam_read1(fp, h, b) >= 0) { + if (!b) goto mem_fail; + while ((r = sam_read1(fp, h, b)) >= 0) { uint32_t x; x = hash_X31_Wang(bam_get_qname(b)) % n_files; - sam_write1(fpt[x], h, b); + if (sam_write1(fpt[x], h, b) < 0) { + print_error_errno("collate", "Couldn't write to intermediate file \"%s\"", fnt[x]); + 
goto fail; + } ++cnt[x]; } bam_destroy1(b); - for (i = 0; i < n_files; ++i) sam_close(fpt[i]); + b = NULL; + if (r < -1) { + fprintf(stderr, "Error reading input file\n"); + goto fail; + } + for (i = 0; i < n_files; ++i) { + // Close split output + r = sam_close(fpt[i]); + fpt[i] = NULL; + if (r < 0) { + fprintf(stderr, "Error on closing '%s'\n", fnt[i]); + return 1; + } + + // Find biggest count + if (max_cnt < cnt[i]) max_cnt = cnt[i]; + } free(fpt); + fpt = NULL; sam_close(fp); - + fp = NULL; // merge sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL); if (!is_stdout) { // output to a file char *fnw = (char*)calloc(l + 5, 1); + if (!fnw) goto mem_fail; if (ga->out.format == unknown_format) sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default else @@ -137,37 +171,86 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, if (fpw == NULL) { if (is_stdout) print_error_errno("collate", "Cannot open standard output"); else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre); - return 1; + goto fail; + } + + if (sam_hdr_write(fpw, h) < 0) { + print_error_errno("collate", "Couldn't write header"); + goto fail; + } + + a = malloc(max_cnt * sizeof(elem_t)); + if (!a) goto mem_fail; + for (j = 0; j < max_cnt; ++j) { + a[j].b = bam_init1(); + if (!a[j].b) { max_cnt = j; goto mem_fail; } } - sam_hdr_write(fpw, h); for (i = 0; i < n_files; ++i) { - int64_t j, c = cnt[i]; - elem_t *a; + int64_t c = cnt[i]; fp = sam_open_format(fnt[i], "r", &ga->in); - bam_hdr_destroy(sam_hdr_read(fp)); - a = (elem_t*)calloc(c, sizeof(elem_t)); + if (NULL == fp) { + print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]); + goto fail; + } + bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header + + // Slurp in one of the split files for (j = 0; j < c; ++j) { - a[j].b = bam_init1(); - sam_read1(fp, h, a[j].b); + if (sam_read1(fp, h, a[j].b) < 0) { + fprintf(stderr, "Error reading '%s'\n", fnt[i]); + goto fail; + 
} a[j].key = hash_X31_Wang(bam_get_qname(a[j].b)); } sam_close(fp); unlink(fnt[i]); free(fnt[i]); - ks_introsort(bamshuf, c, a); + fnt[i] = NULL; + + ks_introsort(bamshuf, c, a); // Shuffle all the reads + + // Write them out again for (j = 0; j < c; ++j) { - sam_write1(fpw, h, a[j].b); - bam_destroy1(a[j].b); + if (sam_write1(fpw, h, a[j].b) < 0) { + print_error_errno("collate", "Error writing to output"); + goto fail; + } } - free(a); } - sam_close(fpw); + bam_hdr_destroy(h); - free(fnt); free(cnt); + for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); + free(a); free(fnt); free(cnt); sam_global_args_free(ga); + if (sam_close(fpw) < 0) { + fprintf(stderr, "Error on closing output\n"); + return 1; + } return 0; + + mem_fail: + fprintf(stderr, "Out of memory\n"); + + fail: + if (fp) sam_close(fp); + if (fpw) sam_close(fpw); + if (h) bam_hdr_destroy(h); + if (b) bam_destroy1(b); + for (i = 0; i < n_files; ++i) { + if (fnt) free(fnt[i]); + if (fpt && fpt[i]) sam_close(fpt[i]); + } + if (a) { + for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); + free(a); + } + free(fnt); + free(fpt); + free(cnt); + sam_global_args_free(ga); + return 1; } static int usage(FILE *fp, int n_files) { diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c index d17cf9b..fb1a5ac 100644 --- a/samtools/bamshuf.c.pysam.c +++ b/samtools/bamshuf.c.pysam.c @@ -3,7 +3,7 @@ /* bamshuf.c -- collate subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013, 2015 Genome Research Ltd. Author: Heng Li @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -77,16 +79,18 @@ static inline int elem_lt(elem_t x, elem_t y) KSORT_INIT(bamshuf, elem_t, elem_lt) static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, - int is_stdout, sam_global_args *ga) + int is_pysam_stdout, sam_global_args *ga) { - samFile *fp, *fpw, **fpt; - char **fnt, modew[8]; - bam1_t *b; - int i, l; - bam_hdr_t *h; - int64_t *cnt; + samFile *fp, *fpw = NULL, **fpt = NULL; + char **fnt = NULL, modew[8]; + bam1_t *b = NULL; + int i, l, r; + bam_hdr_t *h = NULL; + int64_t j, max_cnt = 0, *cnt = NULL; + elem_t *a = NULL; - // split + // Read input, distribute reads pseudo-randomly into n_files temporary + // files. fp = sam_open_format(fn, "r", &ga->in); if (fp == NULL) { print_error_errno("collate", "Cannot open input file \"%s\"", fn); @@ -95,88 +99,167 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, h = sam_hdr_read(fp); if (h == NULL) { - fprintf(pysamerr, "Couldn't read header for '%s'\n", fn); - return 1; + fprintf(pysam_stderr, "Couldn't read header for '%s'\n", fn); + goto fail; } fnt = (char**)calloc(n_files, sizeof(char*)); + if (!fnt) goto mem_fail; fpt = (samFile**)calloc(n_files, sizeof(samFile*)); + if (!fpt) goto mem_fail; cnt = (int64_t*)calloc(n_files, 8); + if (!cnt) goto mem_fail; + l = strlen(pre); for (i = 0; i < n_files; ++i) { fnt[i] = (char*)calloc(l + 10, 1); + if (!fnt[i]) goto mem_fail; sprintf(fnt[i], "%s.%.4d.bam", pre, i); fpt[i] = sam_open(fnt[i], "wb1"); if (fpt[i] == NULL) { print_error_errno("collate", "Cannot open intermediate file \"%s\"", fnt[i]); - return 1; + goto fail; + } + if (sam_hdr_write(fpt[i], h) < 0) { + print_error_errno("collate", "Couldn't write header to intermediate file \"%s\"", fnt[i]); + goto fail; } - sam_hdr_write(fpt[i], h); } b = bam_init1(); - while (sam_read1(fp, h, b) >= 0) { + if (!b) goto mem_fail; + while ((r = sam_read1(fp, h, b)) >= 0) { uint32_t x; x = hash_X31_Wang(bam_get_qname(b)) % 
n_files; - sam_write1(fpt[x], h, b); + if (sam_write1(fpt[x], h, b) < 0) { + print_error_errno("collate", "Couldn't write to intermediate file \"%s\"", fnt[x]); + goto fail; + } ++cnt[x]; } bam_destroy1(b); - for (i = 0; i < n_files; ++i) sam_close(fpt[i]); + b = NULL; + if (r < -1) { + fprintf(pysam_stderr, "Error reading input file\n"); + goto fail; + } + for (i = 0; i < n_files; ++i) { + // Close split output + r = sam_close(fpt[i]); + fpt[i] = NULL; + if (r < 0) { + fprintf(pysam_stderr, "Error on closing '%s'\n", fnt[i]); + return 1; + } + + // Find biggest count + if (max_cnt < cnt[i]) max_cnt = cnt[i]; + } free(fpt); + fpt = NULL; sam_close(fp); - + fp = NULL; // merge sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL); - if (!is_stdout) { // output to a file + if (!is_pysam_stdout) { // output to a file char *fnw = (char*)calloc(l + 5, 1); + if (!fnw) goto mem_fail; if (ga->out.format == unknown_format) sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default else sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out)); fpw = sam_open_format(fnw, modew, &ga->out); free(fnw); - } else fpw = sam_open_format("-", modew, &ga->out); // output to stdout + } else fpw = sam_open_format("-", modew, &ga->out); // output to pysam_stdout if (fpw == NULL) { - if (is_stdout) print_error_errno("collate", "Cannot open standard output"); + if (is_pysam_stdout) print_error_errno("collate", "Cannot open standard output"); else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre); - return 1; + goto fail; + } + + if (sam_hdr_write(fpw, h) < 0) { + print_error_errno("collate", "Couldn't write header"); + goto fail; + } + + a = malloc(max_cnt * sizeof(elem_t)); + if (!a) goto mem_fail; + for (j = 0; j < max_cnt; ++j) { + a[j].b = bam_init1(); + if (!a[j].b) { max_cnt = j; goto mem_fail; } } - sam_hdr_write(fpw, h); for (i = 0; i < n_files; ++i) { - int64_t j, c = cnt[i]; - elem_t *a; + int64_t c = cnt[i]; fp = 
sam_open_format(fnt[i], "r", &ga->in); - bam_hdr_destroy(sam_hdr_read(fp)); - a = (elem_t*)calloc(c, sizeof(elem_t)); + if (NULL == fp) { + print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]); + goto fail; + } + bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header + + // Slurp in one of the split files for (j = 0; j < c; ++j) { - a[j].b = bam_init1(); - sam_read1(fp, h, a[j].b); + if (sam_read1(fp, h, a[j].b) < 0) { + fprintf(pysam_stderr, "Error reading '%s'\n", fnt[i]); + goto fail; + } a[j].key = hash_X31_Wang(bam_get_qname(a[j].b)); } sam_close(fp); unlink(fnt[i]); free(fnt[i]); - ks_introsort(bamshuf, c, a); + fnt[i] = NULL; + + ks_introsort(bamshuf, c, a); // Shuffle all the reads + + // Write them out again for (j = 0; j < c; ++j) { - sam_write1(fpw, h, a[j].b); - bam_destroy1(a[j].b); + if (sam_write1(fpw, h, a[j].b) < 0) { + print_error_errno("collate", "Error writing to output"); + goto fail; + } } - free(a); } - sam_close(fpw); + bam_hdr_destroy(h); - free(fnt); free(cnt); + for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); + free(a); free(fnt); free(cnt); sam_global_args_free(ga); + if (sam_close(fpw) < 0) { + fprintf(pysam_stderr, "Error on closing output\n"); + return 1; + } return 0; + + mem_fail: + fprintf(pysam_stderr, "Out of memory\n"); + + fail: + if (fp) sam_close(fp); + if (fpw) sam_close(fpw); + if (h) bam_hdr_destroy(h); + if (b) bam_destroy1(b); + for (i = 0; i < n_files; ++i) { + if (fnt) free(fnt[i]); + if (fpt && fpt[i]) sam_close(fpt[i]); + } + if (a) { + for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); + free(a); + } + free(fnt); + free(fpt); + free(cnt); + sam_global_args_free(ga); + return 1; } static int usage(FILE *fp, int n_files) { fprintf(fp, "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] \n\n" "Options:\n" - " -O output to stdout\n" + " -O output to pysam_stdout\n" " -u uncompressed BAM output\n" " -l INT compression level [%d]\n" // DEF_CLEVEL " -n INT number of temporary files [%d]\n", // n_files @@ 
-189,7 +272,7 @@ static int usage(FILE *fp, int n_files) { int main_bamshuf(int argc, char *argv[]) { - int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0; + int c, n_files = 64, clevel = DEF_CLEVEL, is_pysam_stdout = 0, is_un = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), @@ -201,15 +284,15 @@ int main_bamshuf(int argc, char *argv[]) case 'n': n_files = atoi(optarg); break; case 'l': clevel = atoi(optarg); break; case 'u': is_un = 1; break; - case 'O': is_stdout = 1; break; + case 'O': is_pysam_stdout = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': return usage(pysamerr, n_files); + case '?': return usage(pysam_stderr, n_files); } } if (is_un) clevel = 0; if (optind + 2 > argc) - return usage(pysamerr, n_files); + return usage(pysam_stderr, n_files); - return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout, &ga); + return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_pysam_stdout, &ga); } diff --git a/samtools/bamtk.c b/samtools/bamtk.c index 4b4df77..5c1c60d 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -1,6 +1,6 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2015 Genome Research Ltd. + Copyright (C) 2008-2016 Genome Research Ltd. Author: Heng Li @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -213,7 +215,7 @@ int main(int argc, char *argv[]) printf( "samtools %s\n" "Using htslib %s\n" -"Copyright (C) 2015 Genome Research Ltd.\n", +"Copyright (C) 2016 Genome Research Ltd.\n", samtools_version(), hts_version()); } else if (strcmp(argv[1], "--version-only") == 0) { diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index a369810..1f3d938 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -2,7 +2,7 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2015 Genome Research Ltd. + Copyright (C) 2008-2016 Genome Research Ltd. Author: Heng Li @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -71,13 +73,13 @@ const char *samtools_version() static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) { - fflush(stdout); - if (subcommand && *subcommand) fprintf(pysamerr, "samtools %s: ", subcommand); - else fprintf(pysamerr, "samtools: "); - vfprintf(pysamerr, format, args); - if (extra) fprintf(pysamerr, ": %s\n", extra); - else fprintf(pysamerr, "\n"); - fflush(pysamerr); + fflush(pysam_stdout); + if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand); + else fprintf(pysam_stderr, "samtools: "); + vfprintf(pysam_stderr, format, args); + if (extra) fprintf(pysam_stderr, ": %s\n", extra); + else fprintf(pysam_stderr, "\n"); + fflush(pysam_stderr); } void print_error(const char *subcommand, const char *format, ...) 
@@ -158,14 +160,13 @@ static void usage(FILE *fp) int samtools_main(int argc, char *argv[]) { #ifdef _WIN32 - setmode(fileno(stdout), O_BINARY); + setmode(fileno(pysam_stdout), O_BINARY); setmode(fileno(stdin), O_BINARY); #endif + if (argc < 2) { usage(pysam_stderr); return 1; } - if (argc < 2) { usage(pysamerr); return 1; } - if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) { - if (argc == 2) { usage(stdout); return 0; } + if (argc == 2) { usage(pysam_stdout); return 0; } // Otherwise change "samtools help COMMAND [...]" to "samtools COMMAND"; // main_xyz() functions by convention display the subcommand's usage @@ -173,6 +174,7 @@ int samtools_main(int argc, char *argv[]) argv++; argc = 2; } + int ret = 0; if (strcmp(argv[1], "view") == 0) ret = main_samview(argc-1, argv+1); else if (strcmp(argv[1], "import") == 0) ret = main_import(argc-1, argv+1); @@ -207,22 +209,22 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1); else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) { - fprintf(pysamerr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); + fprintf(pysam_stderr, "[main] The `pileup' command has been removed. 
Please use `mpileup' instead.\n"); return 1; } else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); else if (strcmp(argv[1], "--version") == 0) { - printf( + fprintf(pysam_stdout, "samtools %s\n" "Using htslib %s\n" -"Copyright (C) 2015 Genome Research Ltd.\n", +"Copyright (C) 2016 Genome Research Ltd.\n", samtools_version(), hts_version()); } else if (strcmp(argv[1], "--version-only") == 0) { - printf("%s+htslib-%s\n", samtools_version(), hts_version()); + fprintf(pysam_stdout, "%s+htslib-%s\n", samtools_version(), hts_version()); } else { - fprintf(pysamerr, "[main] unrecognized command '%s'\n", argv[1]); + fprintf(pysam_stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } return ret; diff --git a/samtools/bedcov.c b/samtools/bedcov.c index e2f0db8..d4dceee 100644 --- a/samtools/bedcov.c +++ b/samtools/bedcov.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c index 6faa7bf..25fdffc 100644 --- a/samtools/bedcov.c.pysam.c +++ b/samtools/bedcov.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -88,9 +90,9 @@ int main_bedcov(int argc, char *argv[]) if (usage) break; } if (usage || optind + 2 > argc) { - fprintf(pysamerr, "Usage: samtools bedcov [options] [...]\n\n"); - fprintf(pysamerr, " -Q INT Only count bases of at least INT quality [0]\n"); - sam_global_opt_help(pysamerr, "-.--."); + fprintf(pysam_stderr, "Usage: samtools bedcov [options] [...]\n\n"); + fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n"); + sam_global_opt_help(pysam_stderr, "-.--."); return 1; } memset(&str, 0, sizeof(kstring_t)); @@ -104,13 +106,13 @@ int main_bedcov(int argc, char *argv[]) if (aux[i]->fp) idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { - fprintf(pysamerr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); + fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); if (aux[i]->header == NULL) { - fprintf(pysamerr, "ERROR: failed to read header for '%s'\n", + fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n", argv[i+optind+1]); return 2; } @@ -153,12 +155,12 @@ int main_bedcov(int argc, char *argv[]) kputc('\t', &str); kputl(cnt[i], &str); } - puts(str.s); + fputs(str.s, pysam_stdout) & fputc('\n', pysam_stdout); bam_mplp_destroy(mplp); continue; bed_error: - fprintf(pysamerr, "Errors in BED line '%s'\n", str.s); + fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); diff --git a/samtools/bedidx.c b/samtools/bedidx.c index 627783e..c1954ad 100644 --- a/samtools/bedidx.c +++ b/samtools/bedidx.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include diff --git a/samtools/bedidx.c.pysam.c b/samtools/bedidx.c.pysam.c index 716aee5..5b7df0c 100644 --- a/samtools/bedidx.c.pysam.c +++ b/samtools/bedidx.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -199,7 +201,7 @@ void *bed_read(const char *fn) // has called their reference "browser" or "track". if (0 == strcmp(ref, "browser")) continue; if (0 == strcmp(ref, "track")) continue; - fprintf(pysamerr, "[bed_read] Parse error reading %s at line %u\n", + fprintf(pysam_stderr, "[bed_read] Parse error reading %s at line %u\n", fn, line); goto fail_no_msg; } @@ -236,7 +238,7 @@ void *bed_read(const char *fn) bed_index(h); return h; fail: - fprintf(pysamerr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno)); + fprintf(pysam_stderr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno)); fail_no_msg: if (ks) ks_destroy(ks); if (fp) gzclose(fp); diff --git a/samtools/cut_target.c b/samtools/cut_target.c index 56ec9f9..71a6c85 100644 --- a/samtools/cut_target.c +++ b/samtools/cut_target.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/cut_target.c.pysam.c b/samtools/cut_target.c.pysam.c index 92b15a0..82a4c4c 100644 --- a/samtools/cut_target.c.pysam.c +++ b/samtools/cut_target.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -126,18 +128,18 @@ static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { if (s >= 0) { int j; - printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); + fprintf(pysam_stdout, "%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); for (j = s; j < i; ++j) { int c = cns[j]>>8; - if (c == 0) putchar('N'); - else putchar("ACGT"[c&3]); + if (c == 0) fputc('N', pysam_stdout); + else fputc("ACGT"[c&3], pysam_stdout); } - putchar('\t'); + fputc('\t', pysam_stdout); for (j = s; j < i; ++j) - putchar(33 + (cns[j]>>8>>2)); - putchar('\n'); + fputc(33 + (cns[j]>>8>>2), pysam_stdout); + fputc('\n', pysam_stdout); } - //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); + //if (s >= 0) fprintf(pysam_stdout, "%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); s = -1; } else if ((b[i]>>2&3) && s < 0) s = i; } @@ -197,18 +199,18 @@ int main_cut_target(int argc, char *argv[]) } if (ga.reference) { g.fai = fai_load(ga.reference); - if (g.fai == 0) fprintf(pysamerr, "[%s] fail to load the fasta index.\n", __func__); + if (g.fai == 0) fprintf(pysam_stderr, "[%s] fail to load the fasta index.\n", __func__); } if (usage || argc == optind) { - fprintf(pysamerr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); - sam_global_opt_help(pysamerr, "-.--f"); + fprintf(pysam_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); + sam_global_opt_help(pysam_stderr, "-.--f"); return 1; } l = max_l = 0; cns = 0; g.fp = sam_open_format(argv[optind], "r", &ga.in); g.h = sam_hdr_read(g.fp); if (g.h == NULL) { - fprintf(pysamerr, "Couldn't read header for '%s'\n", argv[optind]); + fprintf(pysam_stderr, "Couldn't read header for '%s'\n", argv[optind]); sam_close(g.fp); return 1; } diff --git a/samtools/dict.c 
b/samtools/dict.c index 241d119..fa64a16 100644 --- a/samtools/dict.c +++ b/samtools/dict.c @@ -22,7 +22,10 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include +#include #include #include #include "htslib/kseq.h" @@ -140,7 +143,7 @@ int dict_main(int argc, char *argv[]) char *fname = NULL; if ( optind>=argc ) { - if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + if ( !isatty(STDIN_FILENO) ) fname = "-"; // reading from stdin else return dict_usage(); } else fname = argv[optind]; diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c index 6b4a25a..5368851 100644 --- a/samtools/dict.c.pysam.c +++ b/samtools/dict.c.pysam.c @@ -24,7 +24,10 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include +#include #include #include #include "htslib/kseq.h" @@ -51,14 +54,14 @@ static void write_dict(const char *fn, args_t *args) fp = strcmp(fn, "-") ? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { - fprintf(pysamerr, "dict: %s: No such file or directory\n", fn); + fprintf(pysam_stderr, "dict: %s: No such file or directory\n", fn); exit(1); } - FILE *out = stdout; + FILE *out = pysam_stdout; if (args->output_fname) { out = fopen(args->output_fname, "w"); if (out == NULL) { - fprintf(pysamerr, "dict: %s: Cannot open file for writing\n", args->output_fname); + fprintf(pysam_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname); exit(1); } } @@ -97,15 +100,15 @@ static void write_dict(const char *fn, args_t *args) static int dict_usage(void) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "About: Create a sequence dictionary file from a fasta file\n"); - fprintf(pysamerr, "Usage: samtools dict [options] \n\n"); - fprintf(pysamerr, "Options: -a, --assembly STR assembly\n"); - fprintf(pysamerr, " -H, --no-header do not print @HD line\n"); - fprintf(pysamerr, " -o, --output STR file to write out dict file [stdout]\n"); - fprintf(pysamerr, " -s, --species STR species\n"); - fprintf(pysamerr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Create a sequence dictionary file from a fasta file\n"); + fprintf(pysam_stderr, "Usage: samtools dict [options] \n\n"); + fprintf(pysam_stderr, "Options: -a, --assembly STR assembly\n"); + fprintf(pysam_stderr, " -H, --no-header do not print @HD line\n"); + fprintf(pysam_stderr, " -o, --output STR file to write out dict file [pysam_stdout]\n"); + fprintf(pysam_stderr, " -s, --species STR species\n"); + fprintf(pysam_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); + fprintf(pysam_stderr, "\n"); return 1; } @@ -142,7 +145,7 @@ int dict_main(int argc, char *argv[]) char *fname = NULL; if ( optind>=argc ) { - if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + if ( !isatty(STDIN_FILENO) ) fname = "-"; // reading from stdin 
else return dict_usage(); } else fname = argv[optind]; diff --git a/samtools/errmod.c b/samtools/errmod.c index f8b5aa7..c37c6d1 100644 --- a/samtools/errmod.c +++ b/samtools/errmod.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include "errmod.h" #include "htslib/ksort.h" diff --git a/samtools/errmod.c.pysam.c b/samtools/errmod.c.pysam.c index fce3042..12176cf 100644 --- a/samtools/errmod.c.pysam.c +++ b/samtools/errmod.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include "errmod.h" #include "htslib/ksort.h" diff --git a/samtools/faidx.c b/samtools/faidx.c index dcc1041..336bde5 100644 --- a/samtools/faidx.c +++ b/samtools/faidx.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -67,7 +69,9 @@ int faidx_main(int argc, char *argv[]) error(NULL); if ( argc==2 ) { - fai_build(argv[optind]); + if (fai_build(argv[optind]) != 0) { + error("Could not build fai index %s.fai\n", argv[optind]); + } return 0; } diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c index 971db3b..ac06647 100644 --- a/samtools/faidx.c.pysam.c +++ b/samtools/faidx.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -40,14 +42,14 @@ static void error(const char *format, ...) 
{ va_list ap; va_start(ap, format); - vfprintf(pysamerr, format, ap); + vfprintf(pysam_stderr, format, ap); va_end(ap); } else { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Usage: samtools faidx [ [...]]\n"); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Usage: samtools faidx [ [...]]\n"); + fprintf(pysam_stderr, "\n"); } exit(-1); } @@ -69,7 +71,9 @@ int faidx_main(int argc, char *argv[]) error(NULL); if ( argc==2 ) { - fai_build(argv[optind]); + if (fai_build(argv[optind]) != 0) { + error("Could not build fai index %s.fai\n", argv[optind]); + } return 0; } @@ -78,15 +82,15 @@ int faidx_main(int argc, char *argv[]) while ( ++optind%s\n", argv[optind]); + fprintf(pysam_stdout, ">%s\n", argv[optind]); int i, j, seq_len; char *seq = fai_fetch(fai, argv[optind], &seq_len); if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]); for (i=0; i + #include #include #include diff --git a/samtools/kprobaln.c.pysam.c b/samtools/kprobaln.c.pysam.c index 63dad4c..630b730 100644 --- a/samtools/kprobaln.c.pysam.c +++ b/samtools/kprobaln.c.pysam.c @@ -25,6 +25,8 @@ SOFTWARE. */ +#include + #include #include #include @@ -144,7 +146,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]); fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; sum += fi[u] + fi[u+1] + fi[u+2]; -// fprintf(pysamerr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG +// fprintf(pysam_stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG } // rescale s[i] = sum; @@ -199,7 +201,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. 
bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y; -// fprintf(pysamerr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG +// fprintf(pysam_stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG } // rescale set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; @@ -236,7 +238,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer if (state) state[i-1] = max_k; if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; #ifdef _MAIN - fprintf(pysamerr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2, + fprintf(pysam_stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2, "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG #endif } @@ -250,7 +252,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer #ifdef _MAIN #include -int main(int argc, char *argv[]) +int samtools_kprobaln_main(int argc, char *argv[]) { uint8_t conv[256], *iqual, *ref, *query; int c, l_ref, l_query, i, q = 30, b = 10, P; @@ -261,7 +263,7 @@ int main(int argc, char *argv[]) } } if (optind + 2 > argc) { - fprintf(pysamerr, "Usage: %s [-q %d] [-b %d] \n", argv[0], q, b); // example: acttc attc + fprintf(pysam_stderr, "Usage: %s [-q %d] [-b %d] \n", argv[0], q, b); // example: acttc attc return 1; } memset(conv, 4, 256); @@ -275,7 +277,7 @@ int main(int argc, char *argv[]) memset(iqual, q, l_query); kpa_par_def.bw = b; P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0); - fprintf(pysamerr, "%d\n", P); + fprintf(pysam_stderr, "%d\n", P); free(iqual); return 0; } diff --git a/samtools/misc/ace2sam.c b/samtools/misc/ace2sam.c index 24b6933..77b9993 100644 --- a/samtools/misc/ace2sam.c +++ b/samtools/misc/ace2sam.c @@ -23,6 +23,8 @@ SOFTWARE. 
*/ +#include + #include #include #include diff --git a/samtools/misc/ace2sam.c.pysam.c b/samtools/misc/ace2sam.c.pysam.c index a7f92e2..a663399 100644 --- a/samtools/misc/ace2sam.c.pysam.c +++ b/samtools/misc/ace2sam.c.pysam.c @@ -25,6 +25,8 @@ SOFTWARE. */ +#include + #include #include #include @@ -49,7 +51,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) // a fatal error static void fatal(const char *msg) { - fprintf(pysamerr, "E %s\n", msg); + fprintf(pysam_stderr, "E %s\n", msg); exit(1); } // remove pads @@ -64,7 +66,7 @@ static void remove_pads(const kstring_t *src, kstring_t *dst) dst->l = j; } -int main(int argc, char *argv[]) +int samtools_ace2sam_main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; @@ -80,13 +82,13 @@ int main(int argc, char *argv[]) } } if (argc == optind) { - fprintf(pysamerr, "\nUsage: ace2sam [-pc] \n\n"); - fprintf(pysamerr, "Options: -p output padded SAM\n"); - fprintf(pysamerr, " -c write the contig sequence in SAM\n\n"); - fprintf(pysamerr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); - fprintf(pysamerr, " 2. The order of reads in AF and in RD must be identical\n"); - fprintf(pysamerr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); - fprintf(pysamerr, " 4. This program writes the headerless SAM to stdout and header to pysamerr\n\n"); + fprintf(pysam_stderr, "\nUsage: ace2sam [-pc] \n\n"); + fprintf(pysam_stderr, "Options: -p output padded SAM\n"); + fprintf(pysam_stderr, " -c write the contig sequence in SAM\n\n"); + fprintf(pysam_stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); + fprintf(pysam_stderr, " 2. The order of reads in AF and in RD must be identical\n"); + fprintf(pysam_stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); + fprintf(pysam_stderr, " 4. 
This program writes the headerless SAM to pysam_stdout and header to pysam_stderr\n\n"); return 1; } @@ -111,14 +113,14 @@ int main(int argc, char *argv[]) if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences - fprintf(pysamerr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line + fprintf(pysam_stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; - fprintf(pysamerr, "S >%s\n", t[0].s); + fprintf(pysam_stderr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { - fputs("S ", pysamerr); + fputs("S ", pysam_stderr); for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) - fputc(cns->s[i + k], pysamerr); - fputc('\n', pysamerr); + fputc(cns->s[i + k], pysam_stderr); + fputc('\n', pysam_stderr); } #define __padded2cigar(sp) do { \ @@ -152,7 +154,7 @@ int main(int argc, char *argv[]) if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*" for (i = 0; i < t[2].l; ++i) { // read the consensus quality int q; - if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(pysamerr, "E truncated contig quality\n"); + if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(pysam_stderr, "E truncated contig quality\n"); if (s.l) { q = atoi(s.s) + 33; if (q > 126) q = 126; @@ -161,12 +163,12 @@ int main(int argc, char *argv[]) } if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); ks_getuntil(ks, '\n', &s, &dret); // skip the empty line - if (write_cns) puts(t[4].s); t[4].l = 0; + if (write_cns) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); t[4].l = 0; } else if (strcmp(s.s, "AF") == 0) { // padded read position int reversed, neg, pos; if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'"); if (write_cns) { - if (t[4].l) puts(t[4].s); + if (t[4].l) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); t[4].l = 0; } ks_getuntil(ks, 0, &s, &dret); // read name @@ -239,7 +241,7 @@ int main(int argc, char *argv[]) 
kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ kputs("\t*", &t[4]); // QUAL - puts(t[4].s); // print to stdout + fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); // print to pysam_stdout ++af_i; } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); } diff --git a/samtools/padding.c b/samtools/padding.c index 436d716..cea79cf 100644 --- a/samtools/padding.c +++ b/samtools/padding.c @@ -1,7 +1,7 @@ /* padding.c -- depad subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2014, 2015 Genome Research Ltd. + Copyright (C) 2014-2016 Genome Research Ltd. Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. Author: Heng Li @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -32,6 +34,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "sam_header.h" #include "sam_opts.h" +#include "samtools.h" #define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5) @@ -191,6 +194,10 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) int ret = 0, n2 = 0, m2 = 0, *posmap = 0; b = bam_init1(); + if (!b) { + fprintf(stderr, "[depad] Couldn't allocate bam struct\n"); + return -1; + } r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; int read_ret; while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in' @@ -357,7 +364,10 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b)); next_seq: - sam_write1(out, h, b); + if (sam_write1(out, h, b) < 0) { + print_error_errno("depad", "error writing to output"); + return -1; + } } if (read_ret < -1) { fprintf(stderr, "[depad] truncated file.\n"); @@ -525,7 +535,7 @@ int main_pad2unpad(int argc, char *argv[]) } // open file handlers if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) { - fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); + print_error_errno("depad", "failed to open \"%s\" for reading", argv[optind]); ret = 1; goto depad_end; } @@ -548,7 +558,7 @@ int main_pad2unpad(int argc, char *argv[]) char wmode[2]; strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b"); if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { - fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); + print_error_errno("depad", "failed to open \"%s\" for writing", fn_out? 
fn_out : "standard output"); ret = 1; goto depad_end; } @@ -565,14 +575,17 @@ int main_pad2unpad(int argc, char *argv[]) } // Do the depad - ret = bam_pad2unpad(in, out, h, fai); + if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1; depad_end: // close files, free and return if (fai) fai_destroy(fai); if (h) bam_hdr_destroy(h); - sam_close(in); - sam_close(out); + if (in) sam_close(in); + if (out && sam_close(out) < 0) { + fprintf(stderr, "[depad] error on closing output file.\n"); + ret = 1; + } free(fn_list); free(fn_out); return ret; } @@ -593,12 +606,13 @@ static int usage(int is_long_help) sam_global_opt_help(stderr, "-...-"); if (is_long_help) - fprintf(stderr, "Notes:\n\ -\n\ - 1. Requires embedded reference sequences (before the reads for that reference),\n\ - or ideally a FASTA file of the padded reference sequences (via the -T argument).\n\ -\n\ - 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\ -\n"); + fprintf(stderr, +"Notes:\n" +"\n" +"1. Requires embedded reference sequences (before the reads for that reference),\n" +" or ideally a FASTA file of the padded reference sequences (via a -T option).\n" +"\n" +"2. Input padded alignment reads' CIGAR strings must not use P or I operators.\n" +"\n"); return 1; } diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c index fd889f3..9f85c95 100644 --- a/samtools/padding.c.pysam.c +++ b/samtools/padding.c.pysam.c @@ -3,7 +3,7 @@ /* padding.c -- depad subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2014, 2015 Genome Research Ltd. + Copyright (C) 2014-2016 Genome Research Ltd. Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. Author: Heng Li @@ -26,6 +26,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -34,6 +36,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "sam_header.h" #include "sam_opts.h" +#include "samtools.h" #define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5) @@ -96,10 +99,10 @@ static int unpad_seq(bam1_t *b, kstring_t *s) for (i = 0; i < ol; ++i) s->s[s->l++] = 0; if (0 == cigar_n_warning) { cigar_n_warning = -1; - fprintf(pysamerr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b)); + fprintf(pysam_stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b)); } } else { - fprintf(pysamerr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b)); + fprintf(pysam_stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b)); return -1; } } @@ -114,7 +117,7 @@ int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); if (fai_ref_len != ref_len) { - fprintf(pysamerr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); + fprintf(pysam_stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); free(fai_ref); return -1; } @@ -128,7 +131,7 @@ int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) } else { int i = seq_nt16_table[(int)base]; if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 - fprintf(pysamerr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); + fprintf(pysam_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); free(fai_ref); return -1; } @@ -149,19 +152,19 @@ int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); if (fai_ref_len != padded_len) { - fprintf(pysamerr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); + fprintf(pysam_stderr, 
"[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); free(fai_ref); return -1; } for (k = 0; k < padded_len; ++k) { - //fprintf(pysamerr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref)); + //fprintf(pysam_stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref)); base = fai_ref[k]; if (base == '-' || base == '*') { gaps += 1; } else { int i = seq_nt16_table[(int)base]; if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 - fprintf(pysamerr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name); + fprintf(pysam_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name); free(fai_ref); return -1; } @@ -193,6 +196,10 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) int ret = 0, n2 = 0, m2 = 0, *posmap = 0; b = bam_init1(); + if (!b) { + fprintf(pysam_stderr, "[depad] Couldn't allocate bam struct\n"); + return -1; + } r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; int read_ret; while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in' @@ -203,20 +210,20 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) uint32_t *cigar = bam_get_cigar(b); n2 = 0; if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) { - // fprintf(pysamerr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); + // fprintf(pysam_stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); r_tid = b->core.tid; if (0!=unpad_seq(b, &r)) { - fprintf(pysamerr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); + fprintf(pysam_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); return -1; }; if (h->target_len[r_tid] != r.l) { - fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is 
%u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); + fprintf(pysam_stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); return -1; } if (fai) { // Check the embedded reference matches the FASTA file if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { - fprintf(pysamerr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); + fprintf(pysam_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); return -1; } assert(r.l == q.l); @@ -224,7 +231,7 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) for (i = 0; i < r.l; ++i) { if (r.s[i] != q.s[i]) { // Show gaps as ASCII 45 - fprintf(pysamerr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", + fprintf(pysam_stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", h->target_name[b->core.tid], i+1, r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45, q.s[i] ? 
seq_nt16_str[(int)q.s[i]] : 45); @@ -238,25 +245,25 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) } else if (b->core.n_cigar > 0) { int i, k, op; if (b->core.tid < 0) { - fprintf(pysamerr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b)); + fprintf(pysam_stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b)); return -1; } else if (b->core.tid == r_tid) { ; // good case, reference available - //fprintf(pysamerr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); + //fprintf(pysam_stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); } else if (fai) { if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { - fprintf(pysamerr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); return -1; } posmap = update_posmap(posmap, r); r_tid = b->core.tid; - // fprintf(pysamerr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); + // fprintf(pysam_stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); } else { - fprintf(pysamerr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); + fprintf(pysam_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); return -1; } if (0!=unpad_seq(b, &q)) { - fprintf(pysamerr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b)); + fprintf(pysam_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b)); return -1; }; if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) { @@ -325,32 +332,32 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) if (b->core.pos != -1) b->core.pos 
= posmap[b->core.pos]; if (b->core.mtid < 0 || b->core.mpos < 0) { /* Nice case, no mate to worry about*/ - // fprintf(pysamerr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b)); + // fprintf(pysam_stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b)); /* TODO - Warning if FLAG says mate should be mapped? */ /* Clean up funny input where mate position is given but mate reference is missing: */ b->core.mtid = -1; b->core.mpos = -1; } else if (b->core.mtid == b->core.tid) { /* Nice case, same reference */ - // fprintf(pysamerr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b)); + // fprintf(pysam_stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b)); b->core.mpos = posmap[b->core.mpos]; } else { /* Nasty case, Must load alternative posmap */ - // fprintf(pysamerr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); + // fprintf(pysam_stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); if (!fai) { - fprintf(pysamerr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); + fprintf(pysam_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); return -1; } /* Temporarily load the other reference sequence */ if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { - fprintf(pysamerr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); + fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); return -1; } posmap = update_posmap(posmap, r); b->core.mpos = posmap[b->core.mpos]; /* Restore the reference and posmap*/ if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { - fprintf(pysamerr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + 
fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); return -1; } posmap = update_posmap(posmap, r); @@ -359,10 +366,13 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b)); next_seq: - sam_write1(out, h, b); + if (sam_write1(out, h, b) < 0) { + print_error_errno("depad", "error writing to output"); + return -1; + } } if (read_ret < -1) { - fprintf(pysamerr, "[depad] truncated file.\n"); + fprintf(pysam_stderr, "[depad] truncated file.\n"); ret = 1; } free(r.s); free(q.s); free(posmap); @@ -379,10 +389,10 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) for (i = 0; i < old->n_targets; ++i) { unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); if (unpadded_len < 0) { - fprintf(pysamerr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); + fprintf(pysam_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); } else { header->target_len[i] = unpadded_len; - //fprintf(pysamerr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); + //fprintf(pysam_stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); } } /* Duplicating the header allocated new buffer for header string */ @@ -404,7 +414,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) char *name = strstr(text, "\tSN:"); char *name_end; if (!name) { - fprintf(pysamerr, "Unable to find SN: header field\n"); + fprintf(pysam_stderr, "Unable to find SN: header field\n"); return NULL; } name += 4; @@ -458,7 +468,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) /* Check we didn't overflow the buffer */ assert (strlen(header->text) <= strlen(old->text)); if (strlen(header->text) < header->l_text) 
{ - //fprintf(pysamerr, "[depad] Reallocating header buffer\n"); + //fprintf(pysam_stderr, "[depad] Reallocating header buffer\n"); assert (newtext == header->text); newtext = malloc(strlen(header->text) + 1); strcpy(newtext, header->text); @@ -466,7 +476,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) header->text = newtext; header->l_text = strlen(newtext); } - //fprintf(pysamerr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); + //fprintf(pysam_stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); return header; } @@ -507,7 +517,7 @@ int main_pad2unpad(int argc, char *argv[]) break; case '?': is_long_help = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(pysamerr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); + fprintf(pysam_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); return usage(is_long_help); } } @@ -527,30 +537,30 @@ int main_pad2unpad(int argc, char *argv[]) } // open file handlers if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) { - fprintf(pysamerr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); + print_error_errno("depad", "failed to open \"%s\" for reading", argv[optind]); ret = 1; goto depad_end; } if (fn_list && hts_set_fai_filename(in, fn_list) != 0) { - fprintf(pysamerr, "[depad] failed to load reference file \"%s\".\n", fn_list); + fprintf(pysam_stderr, "[depad] failed to load reference file \"%s\".\n", fn_list); ret = 1; goto depad_end; } if ((h = sam_hdr_read(in)) == 0) { - fprintf(pysamerr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); + fprintf(pysam_stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); ret = 1; goto depad_end; } if (fai) { h_fix = fix_header(h, fai); } else { - fprintf(pysamerr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); + fprintf(pysam_stderr, "[depad] Warning - 
reference lengths will not be corrected without FASTA reference\n"); h_fix = h; } char wmode[2]; strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b"); if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { - fprintf(pysamerr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); + print_error_errno("depad", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output"); ret = 1; goto depad_end; } @@ -561,46 +571,50 @@ int main_pad2unpad(int argc, char *argv[]) hts_set_opt(out, CRAM_OPT_NO_REF, 1); if (sam_hdr_write(out, h_fix) != 0) { - fprintf(pysamerr, "[depad] failed to write header.\n"); + fprintf(pysam_stderr, "[depad] failed to write header.\n"); ret = 1; goto depad_end; } // Do the depad - ret = bam_pad2unpad(in, out, h, fai); + if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1; depad_end: // close files, free and return if (fai) fai_destroy(fai); if (h) bam_hdr_destroy(h); - sam_close(in); - sam_close(out); + if (in) sam_close(in); + if (out && sam_close(out) < 0) { + fprintf(pysam_stderr, "[depad] error on closing output file.\n"); + ret = 1; + } free(fn_list); free(fn_out); return ret; } static int usage(int is_long_help) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Usage: samtools depad \n\n"); - fprintf(pysamerr, "Options:\n"); - fprintf(pysamerr, " -s Output is SAM (default is BAM)\n"); - fprintf(pysamerr, " -S Input is SAM (default is BAM)\n"); - fprintf(pysamerr, " -u Uncompressed BAM output (can't use with -s)\n"); - fprintf(pysamerr, " -1 Fast compression BAM output (can't use with -s)\n"); - fprintf(pysamerr, " -T, --reference FILE\n"); - fprintf(pysamerr, " Padded reference sequence file [null]\n"); - fprintf(pysamerr, " -o FILE Output file name [stdout]\n"); - fprintf(pysamerr, " -? 
Longer help\n"); - sam_global_opt_help(pysamerr, "-...-"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Usage: samtools depad \n\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -s Output is SAM (default is BAM)\n"); + fprintf(pysam_stderr, " -S Input is SAM (default is BAM)\n"); + fprintf(pysam_stderr, " -u Uncompressed BAM output (can't use with -s)\n"); + fprintf(pysam_stderr, " -1 Fast compression BAM output (can't use with -s)\n"); + fprintf(pysam_stderr, " -T, --reference FILE\n"); + fprintf(pysam_stderr, " Padded reference sequence file [null]\n"); + fprintf(pysam_stderr, " -o FILE Output file name [pysam_stdout]\n"); + fprintf(pysam_stderr, " -? Longer help\n"); + sam_global_opt_help(pysam_stderr, "-...-"); if (is_long_help) - fprintf(pysamerr, "Notes:\n\ -\n\ - 1. Requires embedded reference sequences (before the reads for that reference),\n\ - or ideally a FASTA file of the padded reference sequences (via the -T argument).\n\ -\n\ - 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\ -\n"); + fprintf(pysam_stderr, +"Notes:\n" +"\n" +"1. Requires embedded reference sequences (before the reads for that reference),\n" +" or ideally a FASTA file of the padded reference sequences (via a -T option).\n" +"\n" +"2. Input padded alignment reads' CIGAR strings must not use P or I operators.\n" +"\n"); return 1; } diff --git a/samtools/phase.c b/samtools/phase.c index 0667ea5..6909912 100644 --- a/samtools/phase.c +++ b/samtools/phase.c @@ -1,7 +1,7 @@ /* phase.c -- phase subcommand. Copyright (C) 2011 Broad Institute. - Copyright (C) 2013, 2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Heng Li @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -30,8 +32,10 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include "htslib/sam.h" +#include "htslib/kstring.h" #include "errmod.h" #include "sam_opts.h" +#include "samtools.h" #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -53,6 +57,7 @@ typedef struct { samFile* fp; bam_hdr_t* fp_hdr; char *pre; + char *out_name[3]; samFile* out[3]; bam_hdr_t* out_hdr[3]; // alignment queue @@ -333,7 +338,7 @@ static int clean_seqs(int vpos, nseq_t *hash) return ret; } -static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) +static int dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) { int i, is_flip, drop_ambi; drop_ambi = g->flag & FLAG_DROP_AMBI; @@ -361,12 +366,16 @@ static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) if (which < 2 && is_flip) which = 1 - which; // increase the randomness } if (which == 3) which = (drand48() < 0.5); - sam_write1(g->out[which], g->out_hdr[which], b); + if (sam_write1(g->out[which], g->out_hdr[which], b) < 0) { + print_error_errno("phase", "error writing to '%s'", g->out_name[which]); + return -1; + } bam_destroy1(b); g->b[i] = 0; } memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); g->n -= i; + return 0; } static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash) @@ -393,7 +402,7 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * else f->phased = 1, f->phase = f->seq[0] - 1; } } - dump_aln(g, min_pos, hash); + if (dump_aln(g, min_pos, hash) < 0) return -1; ++g->vpos_shift; return 1; } @@ -451,7 +460,7 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * printf("//\n"); fflush(stdout); g->vpos_shift += vpos; - dump_aln(g, min_pos, hash); + if (dump_aln(g, min_pos, hash) < 0) return -1; return vpos; } @@ -536,6 +545,26 @@ static int gl2cns(float q[16]) return (min_ij>>2&3) == (min_ij&3)? 
0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2; } +static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat *fmt) +{ + kstring_t s = { 0, 0, NULL }; + ksprintf(&s, "%s.%s.%s", g->pre, middle, hts_format_file_extension(fmt)); + g->out_name[c] = ks_release(&s); + g->out[c] = sam_open_format(g->out_name[c], "wb", fmt); + if (! g->out[c]) { + print_error_errno("phase", "Failed to open output file '%s'", g->out_name[c]); + return -1; + } + + g->out_hdr[c] = bam_hdr_dup(g->fp_hdr); + if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) { + print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]); + return -1; + } + + return 0; +} + int main_phase(int argc, char *argv[]) { int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0; @@ -555,6 +584,8 @@ int main_phase(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; + // FIXME Leaks galore in the case of error returns + memset(&g, 0, sizeof(phaseg_t)); g.flag = FLAG_FIX_CHIMERA; g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; @@ -594,9 +625,14 @@ int main_phase(int argc, char *argv[]) return 1; } g.fp = sam_open_format(argv[optind], "r", &ga.in); + if (!g.fp) { + print_error_errno("phase", "Couldn't open '%s'", argv[optind]); + return 1; + } g.fp_hdr = sam_hdr_read(g.fp); if (g.fp_hdr == NULL) { - fprintf(stderr, "Failed to read header for '%s'\n", argv[optind]); + fprintf(stderr, "[%s] Failed to read header for '%s'\n", + __func__, argv[optind]); return 1; } if (fn_list) { // read the list of sites to phase @@ -604,20 +640,13 @@ int main_phase(int argc, char *argv[]) free(fn_list); } else g.flag &= ~FLAG_LIST_EXCL; if (g.pre) { // open BAMs to write - char *s = (char*)malloc(strlen(g.pre) + 20); if (ga.out.format == unknown_format) ga.out.format = bam; // default via "wb". 
- strcpy(s, g.pre); strcat(s, ".0."); strcat(s, hts_format_file_extension(&ga.out)); - g.out[0] = sam_open_format(s, "wb", &ga.out); - strcpy(s, g.pre); strcat(s, ".1."); strcat(s, hts_format_file_extension(&ga.out)); - g.out[1] = sam_open_format(s, "wb", &ga.out); - strcpy(s, g.pre); strcat(s, ".chimera."); strcat(s, hts_format_file_extension(&ga.out)); - g.out[2] = sam_open_format(s, "wb", &ga.out); - for (c = 0; c <= 2; ++c) { - g.out_hdr[c] = bam_hdr_dup(g.fp_hdr); - sam_hdr_write(g.out[c], g.out_hdr[c]); - } - free(s); + + // Open each output file g.out[0..2], dupping and writing the header + if (start_output(&g, 0, "0", &ga.out) < 0 || + start_output(&g, 1, "1", &ga.out) < 0 || + start_output(&g, 2, "chimera", &ga.out) < 0) return 1; } iter = bam_plp_init(readaln, &g); @@ -647,7 +676,10 @@ int main_phase(int argc, char *argv[]) g.vpos_shift = 0; if (lasttid >= 0) { seqs = shrink_hash(seqs); - phase(&g, g.fp_hdr->target_name[lasttid], vpos, cns, seqs); + if (phase(&g, g.fp_hdr->target_name[lasttid], + vpos, cns, seqs) < 0) { + return 1; + } update_vpos(0x7fffffff, seqs); } lasttid = tid; @@ -716,14 +748,20 @@ int main_phase(int argc, char *argv[]) } if (dophase) { seqs = shrink_hash(seqs); - phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs); + if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { + return 1; + } update_vpos(vpos, seqs); cns[0] = cns[vpos]; vpos = 0; } ++vpos; } - if (tid >= 0) phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs); + if (tid >= 0) { + if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { + return 1; + } + } bam_hdr_destroy(g.fp_hdr); bam_plp_destroy(iter); sam_close(g.fp); @@ -733,11 +771,18 @@ int main_phase(int argc, char *argv[]) errmod_destroy(em); free(bases); if (g.pre) { + int res = 0; for (c = 0; c <= 2; ++c) { - sam_close(g.out[c]); + if (sam_close(g.out[c]) < 0) { + fprintf(stderr, "[%s] error on closing '%s'\n", + __func__, g.out_name[c]); + res = 1; + } bam_hdr_destroy(g.out_hdr[c]); + 
free(g.out_name[c]); } free(g.pre); free(g.b); + if (res) return 1; } sam_global_args_free(&ga); return 0; diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c index bc1d455..3babd37 100644 --- a/samtools/phase.c.pysam.c +++ b/samtools/phase.c.pysam.c @@ -3,7 +3,7 @@ /* phase.c -- phase subcommand. Copyright (C) 2011 Broad Institute. - Copyright (C) 2013, 2014 Genome Research Ltd. + Copyright (C) 2013-2016 Genome Research Ltd. Author: Heng Li @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -32,8 +34,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include "htslib/sam.h" +#include "htslib/kstring.h" #include "errmod.h" #include "sam_opts.h" +#include "samtools.h" #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -55,6 +59,7 @@ typedef struct { samFile* fp; bam_hdr_t* fp_hdr; char *pre; + char *out_name[3]; samFile* out[3]; bam_hdr_t* out_hdr[3]; // alignment queue @@ -335,7 +340,7 @@ static int clean_seqs(int vpos, nseq_t *hash) return ret; } -static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) +static int dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) { int i, is_flip, drop_ambi; drop_ambi = g->flag & FLAG_DROP_AMBI; @@ -363,12 +368,16 @@ static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) if (which < 2 && is_flip) which = 1 - which; // increase the randomness } if (which == 3) which = (drand48() < 0.5); - sam_write1(g->out[which], g->out_hdr[which], b); + if (sam_write1(g->out[which], g->out_hdr[which], b) < 0) { + print_error_errno("phase", "error writing to '%s'", g->out_name[which]); + return -1; + } bam_destroy1(b); g->b[i] = 0; } memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); g->n -= i; + return 0; } static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash) @@ -383,8 +392,8 @@ 
static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos min_pos = i? cns[vpos]>>32 : 0x7fffffff; if (vpos == 1) { - printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); - printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, + fprintf(pysam_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); + fprintf(pysam_stdout, "M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1); for (k = 0; k < kh_end(hash); ++k) { if (kh_exist(hash, k)) { @@ -395,14 +404,14 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * else f->phased = 1, f->phase = f->seq[0] - 1; } } - dump_aln(g, min_pos, hash); + if (dump_aln(g, min_pos, hash) < 0) return -1; ++g->vpos_shift; return 1; } { // phase int **cnt; uint64_t *mask; - printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); + fprintf(pysam_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); sitemask = calloc(vpos, 1); cnt = count_all(g->k, vpos, hash); path = dynaprog(g->k, vpos, cnt); @@ -423,13 +432,13 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * } } for (i = 0; i < n_masked; ++i) - printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); + fprintf(pysam_stdout, "FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); for (i = 0; i < vpos; ++i) { uint64_t x = pcnt[i]; int8_t c[2]; c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); c[1] = (cns[i]>>16&0xffff)>>2 == 0? 
4 : (cns[i]>>16&3); - printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], + fprintf(pysam_stdout, "M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); } free(path); free(pcnt); free(regmask); free(sitemask); @@ -441,19 +450,19 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * ks_introsort_rseq(n_seqs, seqs); for (i = 0; i < n_seqs; ++i) { frag_t *f = seqs[i]; - printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); + fprintf(pysam_stdout, "EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); for (j = 0; j < f->vlen; ++j) { uint32_t c = cns[f->vpos + j]; - if (f->seq[j] == 0) putchar('N'); - else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]); + if (f->seq[j] == 0) fputc('N', pysam_stdout); + else fputc("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)], pysam_stdout); } - printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); + fprintf(pysam_stdout, "\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); } free(seqs); - printf("//\n"); - fflush(stdout); + fprintf(pysam_stdout, "//\n"); + fflush(pysam_stdout); g->vpos_shift += vpos; - dump_aln(g, min_pos, hash); + if (dump_aln(g, min_pos, hash) < 0) return -1; return vpos; } @@ -538,6 +547,26 @@ static int gl2cns(float q[16]) return (min_ij>>2&3) == (min_ij&3)? 
0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2; } +static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat *fmt) +{ + kstring_t s = { 0, 0, NULL }; + ksprintf(&s, "%s.%s.%s", g->pre, middle, hts_format_file_extension(fmt)); + g->out_name[c] = ks_release(&s); + g->out[c] = sam_open_format(g->out_name[c], "wb", fmt); + if (! g->out[c]) { + print_error_errno("phase", "Failed to open output file '%s'", g->out_name[c]); + return -1; + } + + g->out_hdr[c] = bam_hdr_dup(g->fp_hdr); + if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) { + print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]); + return -1; + } + + return 0; +} + int main_phase(int argc, char *argv[]) { int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0; @@ -557,6 +586,8 @@ int main_phase(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; + // FIXME Leaks galore in the case of error returns + memset(&g, 0, sizeof(phaseg_t)); g.flag = FLAG_FIX_CHIMERA; g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; @@ -578,27 +609,32 @@ int main_phase(int argc, char *argv[]) if (usage) break; } if (usage || argc == optind) { - fprintf(pysamerr, "\n"); - fprintf(pysamerr, "Usage: samtools phase [options] \n\n"); - fprintf(pysamerr, "Options: -k INT block length [%d]\n", g.k); - fprintf(pysamerr, " -b STR prefix of BAMs to output [null]\n"); - fprintf(pysamerr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); - fprintf(pysamerr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); - fprintf(pysamerr, " -D INT max read depth [%d]\n", g.max_depth); -// fprintf(pysamerr, " -l FILE list of sites to phase [null]\n"); - fprintf(pysamerr, " -F do not attempt to fix chimeras\n"); - fprintf(pysamerr, " -A drop reads with ambiguous phase\n"); -// fprintf(pysamerr, " -e do not discover SNPs (effective with -l)\n"); - fprintf(pysamerr, "\n"); - - sam_global_opt_help(pysamerr, "-...."); + fprintf(pysam_stderr, "\n"); + 
fprintf(pysam_stderr, "Usage: samtools phase [options] \n\n"); + fprintf(pysam_stderr, "Options: -k INT block length [%d]\n", g.k); + fprintf(pysam_stderr, " -b STR prefix of BAMs to output [null]\n"); + fprintf(pysam_stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); + fprintf(pysam_stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); + fprintf(pysam_stderr, " -D INT max read depth [%d]\n", g.max_depth); +// fprintf(pysam_stderr, " -l FILE list of sites to phase [null]\n"); + fprintf(pysam_stderr, " -F do not attempt to fix chimeras\n"); + fprintf(pysam_stderr, " -A drop reads with ambiguous phase\n"); +// fprintf(pysam_stderr, " -e do not discover SNPs (effective with -l)\n"); + fprintf(pysam_stderr, "\n"); + + sam_global_opt_help(pysam_stderr, "-...."); return 1; } g.fp = sam_open_format(argv[optind], "r", &ga.in); + if (!g.fp) { + print_error_errno("phase", "Couldn't open '%s'", argv[optind]); + return 1; + } g.fp_hdr = sam_hdr_read(g.fp); if (g.fp_hdr == NULL) { - fprintf(pysamerr, "Failed to read header for '%s'\n", argv[optind]); + fprintf(pysam_stderr, "[%s] Failed to read header for '%s'\n", + __func__, argv[optind]); return 1; } if (fn_list) { // read the list of sites to phase @@ -606,20 +642,13 @@ int main_phase(int argc, char *argv[]) free(fn_list); } else g.flag &= ~FLAG_LIST_EXCL; if (g.pre) { // open BAMs to write - char *s = (char*)malloc(strlen(g.pre) + 20); if (ga.out.format == unknown_format) ga.out.format = bam; // default via "wb". 
- strcpy(s, g.pre); strcat(s, ".0."); strcat(s, hts_format_file_extension(&ga.out)); - g.out[0] = sam_open_format(s, "wb", &ga.out); - strcpy(s, g.pre); strcat(s, ".1."); strcat(s, hts_format_file_extension(&ga.out)); - g.out[1] = sam_open_format(s, "wb", &ga.out); - strcpy(s, g.pre); strcat(s, ".chimera."); strcat(s, hts_format_file_extension(&ga.out)); - g.out[2] = sam_open_format(s, "wb", &ga.out); - for (c = 0; c <= 2; ++c) { - g.out_hdr[c] = bam_hdr_dup(g.fp_hdr); - sam_hdr_write(g.out[c], g.out_hdr[c]); - } - free(s); + + // Open each output file g.out[0..2], dupping and writing the header + if (start_output(&g, 0, "0", &ga.out) < 0 || + start_output(&g, 1, "1", &ga.out) < 0 || + start_output(&g, 2, "chimera", &ga.out) < 0) return 1; } iter = bam_plp_init(readaln, &g); @@ -627,20 +656,20 @@ int main_phase(int argc, char *argv[]) seqs = kh_init(64); em = errmod_init(1. - 0.83); bases = calloc(g.max_depth, 2); - printf("CC\n"); - printf("CC\tDescriptions:\nCC\n"); - printf("CC\t CC comments\n"); - printf("CC\t PS start of a phase set\n"); - printf("CC\t FL filtered region\n"); - printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); - printf("CC\t EV supporting reads; SAM format\n"); - printf("CC\t // end of a phase set\nCC\n"); - printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); - printf("CC\t PS chr phaseSetStart phaseSetEnd\n"); - printf("CC\t FL chr filterStart filterEnd\n"); - printf("CC\t M? 
chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); - printf("CC\nCC\n"); - fflush(stdout); + fprintf(pysam_stdout, "CC\n"); + fprintf(pysam_stdout, "CC\tDescriptions:\nCC\n"); + fprintf(pysam_stdout, "CC\t CC comments\n"); + fprintf(pysam_stdout, "CC\t PS start of a phase set\n"); + fprintf(pysam_stdout, "CC\t FL filtered region\n"); + fprintf(pysam_stdout, "CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); + fprintf(pysam_stdout, "CC\t EV supporting reads; SAM format\n"); + fprintf(pysam_stdout, "CC\t // end of a phase set\nCC\n"); + fprintf(pysam_stdout, "CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); + fprintf(pysam_stdout, "CC\t PS chr phaseSetStart phaseSetEnd\n"); + fprintf(pysam_stdout, "CC\t FL chr filterStart filterEnd\n"); + fprintf(pysam_stdout, "CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); + fprintf(pysam_stdout, "CC\nCC\n"); + fflush(pysam_stdout); while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { int i, k, c, tmp, dophase = 1, in_set = 0; float q[16]; @@ -649,7 +678,10 @@ int main_phase(int argc, char *argv[]) g.vpos_shift = 0; if (lasttid >= 0) { seqs = shrink_hash(seqs); - phase(&g, g.fp_hdr->target_name[lasttid], vpos, cns, seqs); + if (phase(&g, g.fp_hdr->target_name[lasttid], + vpos, cns, seqs) < 0) { + return 1; + } update_vpos(0x7fffffff, seqs); } lasttid = tid; @@ -718,14 +750,20 @@ int main_phase(int argc, char *argv[]) } if (dophase) { seqs = shrink_hash(seqs); - phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs); + if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { + return 1; + } update_vpos(vpos, seqs); cns[0] = cns[vpos]; vpos = 0; } ++vpos; } - if (tid >= 0) phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs); + if (tid >= 0) { + if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { + return 1; + } + } bam_hdr_destroy(g.fp_hdr); bam_plp_destroy(iter); sam_close(g.fp); @@ -735,11 +773,18 
@@ int main_phase(int argc, char *argv[]) errmod_destroy(em); free(bases); if (g.pre) { + int res = 0; for (c = 0; c <= 2; ++c) { - sam_close(g.out[c]); + if (sam_close(g.out[c]) < 0) { + fprintf(pysam_stderr, "[%s] error on closing '%s'\n", + __func__, g.out_name[c]); + res = 1; + } bam_hdr_destroy(g.out_hdr[c]); + free(g.out_name[c]); } free(g.pre); free(g.b); + if (res) return 1; } sam_global_args_free(&ga); return 0; diff --git a/samtools/pysam.h b/samtools/pysam.h index 008cbbd..b0fc4fb 100644 --- a/samtools/pysam.h +++ b/samtools/pysam.h @@ -1,5 +1,7 @@ #ifndef PYSAM_H #define PYSAM_H #include "stdio.h" -extern FILE * pysamerr; +extern FILE * pysam_stderr; +extern FILE * pysam_stdout; +extern const char * pysam_stdout_fn; #endif diff --git a/samtools/sam.c b/samtools/sam.c index d6cc9f6..237c3e8 100644 --- a/samtools/sam.c +++ b/samtools/sam.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "htslib/faidx.h" @@ -31,7 +33,7 @@ DEALINGS IN THE SOFTWARE. 
*/ int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) { if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1; - bgzf_mt(fp->x.bam, n_threads, n_sub_blks); + if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1; return 0; } @@ -42,6 +44,10 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) if (hts_fp == NULL) return NULL; samfile_t *fp = malloc(sizeof (samfile_t)); + if (!fp) { + sam_close(hts_fp); + return NULL; + } fp->file = hts_fp; fp->x.bam = hts_fp->fp.bgzf; if (strchr(mode, 'r')) { @@ -66,7 +72,15 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) enum htsExactFormat fmt = hts_get_format(fp->file)->format; fp->header = (bam_hdr_t *)aux; // For writing, we won't free it fp->is_write = 1; - if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header); + if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { + if (sam_hdr_write(fp->file, fp->header) < 0) { + if (bam_verbose >= 1) + fprintf(stderr, "[samopen] Couldn't write header\n"); + sam_close(hts_fp); + free(fp); + return NULL; + } + } } return fp; diff --git a/samtools/sam.c.pysam.c b/samtools/sam.c.pysam.c index e7c4cac..f7db820 100644 --- a/samtools/sam.c.pysam.c +++ b/samtools/sam.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "htslib/faidx.h" @@ -33,7 +35,7 @@ DEALINGS IN THE SOFTWARE. 
*/ int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) { if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1; - bgzf_mt(fp->x.bam, n_threads, n_sub_blks); + if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1; return 0; } @@ -44,6 +46,10 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) if (hts_fp == NULL) return NULL; samfile_t *fp = malloc(sizeof (samfile_t)); + if (!fp) { + sam_close(hts_fp); + return NULL; + } fp->file = hts_fp; fp->x.bam = hts_fp->fp.bgzf; if (strchr(mode, 'r')) { @@ -62,13 +68,21 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) } fp->is_write = 0; if (fp->header->n_targets == 0 && bam_verbose >= 1) - fprintf(pysamerr, "[samopen] no @SQ lines in the header.\n"); + fprintf(pysam_stderr, "[samopen] no @SQ lines in the header.\n"); } else { enum htsExactFormat fmt = hts_get_format(fp->file)->format; fp->header = (bam_hdr_t *)aux; // For writing, we won't free it fp->is_write = 1; - if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header); + if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { + if (sam_hdr_write(fp->file, fp->header) < 0) { + if (bam_verbose >= 1) + fprintf(pysam_stderr, "[samopen] Couldn't write header\n"); + sam_close(hts_fp); + free(fp); + return NULL; + } + } } return fp; @@ -122,11 +136,11 @@ char *samfaipath(const char *fn_ref) strcat(strcpy(fn_list, fn_ref), ".fai"); if (access(fn_list, R_OK) == -1) { // fn_list is unreadable if (access(fn_ref, R_OK) == -1) { - fprintf(pysamerr, "[samfaipath] fail to read file %s.\n", fn_ref); + fprintf(pysam_stderr, "[samfaipath] fail to read file %s.\n", fn_ref); } else { - if (bam_verbose >= 3) fprintf(pysamerr, "[samfaipath] build FASTA index...\n"); + if (bam_verbose >= 3) fprintf(pysam_stderr, "[samfaipath] build FASTA index...\n"); if (fai_build(fn_ref) == -1) { - fprintf(pysamerr, "[samfaipath] fail to build FASTA index.\n"); + 
fprintf(pysam_stderr, "[samfaipath] fail to build FASTA index.\n"); free(fn_list); fn_list = 0; } } diff --git a/samtools/sam_header.c b/samtools/sam_header.c index 75ca724..64da68f 100644 --- a/samtools/sam_header.c +++ b/samtools/sam_header.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "sam_header.h" #include #include diff --git a/samtools/sam_header.c.pysam.c b/samtools/sam_header.c.pysam.c index ecf937c..e39807d 100644 --- a/samtools/sam_header.c.pysam.c +++ b/samtools/sam_header.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "sam_header.h" #include #include @@ -81,7 +83,7 @@ static void debug(const char *format, ...) { va_list ap; va_start(ap, format); - vfprintf(pysamerr, format, ap); + vfprintf(pysam_stderr, format, ap); va_end(ap); } @@ -775,8 +777,8 @@ void *sam_header_merge(int n, const void **_dicts) if ( status==2 ) { - print_header_line(pysamerr,tmpl_hlines->data); - print_header_line(pysamerr,out_hlines->data); + print_header_line(pysam_stderr,tmpl_hlines->data); + print_header_line(pysam_stderr,out_hlines->data); debug("Conflicting lines, cannot merge the headers.\n"); return 0; } diff --git a/samtools/sam_opts.c b/samtools/sam_opts.c index 0ed197e..9369145 100644 --- a/samtools/sam_opts.c +++ b/samtools/sam_opts.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include diff --git a/samtools/sam_opts.c.pysam.c b/samtools/sam_opts.c.pysam.c index c976438..d0b56a3 100644 --- a/samtools/sam_opts.c.pysam.c +++ b/samtools/sam_opts.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -79,7 +81,7 @@ int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt, } if (!lopt->name) { - fprintf(pysamerr, "Unexpected global option: %s\n", lopt->name); + fprintf(pysam_stderr, "Unexpected global option: %s\n", lopt->name); return -1; } diff --git a/samtools/sam_view.c b/samtools/sam_view.c index 4358a1c..402e1d3 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -96,7 +98,7 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin } if (settings->library) { const char *p = bam_get_library((bam_hdr_t*)h, b); - if (p && strcmp(p, settings->library) != 0) return 1; + if (!p || strcmp(p, settings->library) != 0) return 1; } if (settings->remove_aux_len) { size_t i; @@ -400,18 +402,18 @@ int main_samview(int argc, char *argv[]) } } if (fn_un_out) { - if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { + if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(un_out, fn_list) != 0) { - fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); - ret = 1; - goto view_end; - } + if (fn_list) { + if (hts_set_fai_filename(un_out, fn_list) != 0) { + fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + ret = 1; + goto view_end; } + } if (*out_format || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { @@ -556,35 +558,37 @@ static int usage(FILE *fp, int exit_status, int is_long_help) fprintf(fp, "Notes:\n" "\n" -" 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" -" Further control over the CRAM format can be specified by using the\n" -" --output-fmt-option, e.g. to specify the number of sequences per slice\n" -" and to use avoid reference based compression:\n" -" `samtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" -" --output-fmt-option no_ref -o out.cram in.bam'\n" +"1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" +" Further control over the CRAM format can be specified by using the\n" +" --output-fmt-option, e.g. 
to specify the number of sequences per slice\n" +" and to use avoid reference based compression:\n" "\n" -" Options can also be specified as a comma separated list within the\n" -" --output-fmt value too. For example this is equivalent to the above\n" -" `samtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" -" -o out.cram in.bam'\n" +"\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" +"\t --output-fmt-option no_ref -o out.cram in.bam\n" "\n" -" 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" -" two fields of each line consisting of the reference name and the\n" -" corresponding sequence length. The `.fai' file generated by \n" -" `samtools faidx' is suitable for use as this file. This may be an\n" -" empty file if reads are unaligned.\n" +" Options can also be specified as a comma separated list within the\n" +" --output-fmt value too. For example this is equivalent to the above\n" "\n" -" 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n" +"\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" +"\t -o out.cram in.bam\n" "\n" -" 4. BAM->SAM conversion: `samtools view -h in.bam'.\n" +"2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" +" two fields of each line consisting of the reference name and the\n" +" corresponding sequence length. The `.fai' file generated by \n" +" `samtools faidx' is suitable for use as this file. This may be an\n" +" empty file if reads are unaligned.\n" "\n" -" 5. A region should be presented in one of the following formats:\n" -" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" -" specified, the input alignment file must be a sorted and indexed\n" -" alignment (BAM/CRAM) file.\n" +"3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n" "\n" -" 6. Option `-u' is preferred over `-b' when the output is piped to\n" -" another samtools command.\n" +"4. BAM->SAM conversion: samtools view -h in.bam\n" +"\n" +"5. 
A region should be presented in one of the following formats:\n" +" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" +" specified, the input alignment file must be a sorted and indexed\n" +" alignment (BAM/CRAM) file.\n" +"\n" +"6. Option `-u' is preferred over `-b' when the output is piped to\n" +" another samtools command.\n" "\n"); return exit_status; @@ -611,6 +615,7 @@ static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; static void bam2fq_usage(FILE *to, const char *command) { + int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; fprintf(to, "Usage: samtools %s [options...] \n", command); fprintf(to, @@ -620,10 +625,14 @@ static void bam2fq_usage(FILE *to, const char *command) " -2 FILE write paired reads flagged READ2 to FILE\n" " -f INT only include reads with all bits set in INT set in FLAG [0]\n" " -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" -" -n don't append /1 and /2 to the read name\n" -" -O output quality in the OQ tag if present\n" +" -n don't append /1 and /2 to the read name\n"); + if (fq) fprintf(to, +" -O output quality in the OQ tag if present\n"); + fprintf(to, " -s FILE write singleton reads to FILE [assume single-end]\n" -" -t copy RG, BC and QT tags to the FASTQ header line\n" +" -t copy RG, BC and QT tags to the %s header line\n", + fq ? 
"FASTQ" : "FASTA"); + if (fq) fprintf(to, " -v INT default quality score if not given in file [1]\n"); sam_global_opt_help(to, "-.--."); } @@ -673,7 +682,10 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t uint8_t *seq; uint8_t *qual = bam_get_qual(b); const uint8_t *oq = NULL; - if (state->use_oq) oq = bam_aux_get(b, "OQ") + 1; + if (state->use_oq) { + oq = bam_aux_get(b, "OQ"); + if (oq) oq++; // skip tag type + } bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality linebuf->l = 0; @@ -921,7 +933,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) bool valid = true; while (true) { - at_eof = sam_read1(state->fp, state->h, b); + at_eof = sam_read1(state->fp, state->h, b) < 0; if (!at_eof && filter_it_out(b, state)) continue; if (!at_eof) ++n_reads; diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index dfc8065..3d5ffa5 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include @@ -98,7 +100,7 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin } if (settings->library) { const char *p = bam_get_library((bam_hdr_t*)h, b); - if (p && strcmp(p, settings->library) != 0) return 1; + if (!p || strcmp(p, settings->library) != 0) return 1; } if (settings->remove_aux_len) { size_t i; @@ -317,8 +319,8 @@ int main_samview(int argc, char *argv[]) case 'x': { if (strlen(optarg) != 2) { - fprintf(pysamerr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); - return usage(pysamerr, EXIT_FAILURE, is_long_help); + fprintf(pysam_stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); + return usage(pysam_stderr, EXIT_FAILURE, is_long_help); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); settings.remove_aux[settings.remove_aux_len-1] = optarg; @@ -327,7 +329,7 @@ int main_samview(int argc, char *argv[]) default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) - return usage(pysamerr, EXIT_FAILURE, is_long_help); + return usage(pysam_stderr, EXIT_FAILURE, is_long_help); break; } } @@ -347,7 +349,7 @@ int main_samview(int argc, char *argv[]) strcat(out_mode, tmp); strcat(out_un_mode, tmp); } - if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... + if (argc == optind && isatty(STDIN_FILENO)) return usage(pysam_stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... fn_in = (optind < argc)? 
argv[optind] : "-"; // generate the fn_list if necessary @@ -361,13 +363,13 @@ int main_samview(int argc, char *argv[]) if (fn_list) { if (hts_set_fai_filename(in, fn_list) != 0) { - fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if ((header = sam_hdr_read(in)) == 0) { - fprintf(pysamerr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); + fprintf(pysam_stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); ret = 1; goto view_end; } @@ -387,7 +389,7 @@ int main_samview(int argc, char *argv[]) } if (fn_list) { if (hts_set_fai_filename(out, fn_list) != 0) { - fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } @@ -396,29 +398,29 @@ int main_samview(int argc, char *argv[]) out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(out, header) != 0) { - fprintf(pysamerr, "[main_samview] failed to write the SAM header\n"); + fprintf(pysam_stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (fn_un_out) { - if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { + if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(un_out, fn_list) != 0) { - fprintf(pysamerr, "[main_samview] failed to use reference \"%s\".\n", fn_list); - ret = 1; - goto view_end; - } + if (fn_list) { + if (hts_set_fai_filename(un_out, fn_list) != 0) { + fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + ret = 1; + goto view_end; } + } if (*out_format || is_header || 
out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(un_out, header) != 0) { - fprintf(pysamerr, "[main_samview] failed to write the SAM header\n"); + fprintf(pysam_stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } @@ -441,7 +443,7 @@ int main_samview(int argc, char *argv[]) } } if (r < -1) { - fprintf(pysamerr, "[main_samview] truncated file.\n"); + fprintf(pysam_stderr, "[main_samview] truncated file.\n"); ret = 1; } bam_destroy1(b); @@ -450,7 +452,7 @@ int main_samview(int argc, char *argv[]) bam1_t *b; hts_idx_t *idx = sam_index_load(in, fn_in); // load index if (idx == 0) { // index is unavailable - fprintf(pysamerr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); + fprintf(pysam_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); ret = 1; goto view_end; } @@ -461,9 +463,9 @@ int main_samview(int argc, char *argv[]) if (iter == NULL) { // region invalid or reference name not found int beg, end; if (hts_parse_reg(argv[i], &beg, &end)) - fprintf(pysamerr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); + fprintf(pysam_stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); else - fprintf(pysamerr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]); + fprintf(pysam_stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); continue; } // fetch alignments @@ -477,7 +479,7 @@ int main_samview(int argc, char *argv[]) } hts_itr_destroy(iter); if (result < -1) { - fprintf(pysamerr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + fprintf(pysam_stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); ret = 1; break; } @@ -488,7 +490,7 @@ int main_samview(int argc, char *argv[]) view_end: if (is_count && ret == 0) - printf("%" PRId64 "\n", count); + fprintf(pysam_stdout, "%" PRId64 "\n", count); // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); @@ -526,7 +528,7 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -h include header in SAM output\n" " -H print SAM header only (no alignments)\n" " -c print only the count of matching records\n" -" -o FILE output file name [stdout]\n" +" -o FILE output file name [pysam_stdout]\n" " -U FILE output reads not selected by filters to FILE [null]\n" // extra input " -t FILE FILE listing reference names and lengths (see long help) [null]\n" @@ -558,35 +560,37 @@ static int usage(FILE *fp, int exit_status, int is_long_help) fprintf(fp, "Notes:\n" "\n" -" 1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" -" Further control over the CRAM format can be specified by using the\n" -" --output-fmt-option, e.g. to specify the number of sequences per slice\n" -" and to use avoid reference based compression:\n" -" `samtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" -" --output-fmt-option no_ref -o out.cram in.bam'\n" +"1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" +" Further control over the CRAM format can be specified by using the\n" +" --output-fmt-option, e.g. 
to specify the number of sequences per slice\n" +" and to use avoid reference based compression:\n" "\n" -" Options can also be specified as a comma separated list within the\n" -" --output-fmt value too. For example this is equivalent to the above\n" -" `samtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" -" -o out.cram in.bam'\n" +"\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" +"\t --output-fmt-option no_ref -o out.cram in.bam\n" "\n" -" 2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" -" two fields of each line consisting of the reference name and the\n" -" corresponding sequence length. The `.fai' file generated by \n" -" `samtools faidx' is suitable for use as this file. This may be an\n" -" empty file if reads are unaligned.\n" +" Options can also be specified as a comma separated list within the\n" +" --output-fmt value too. For example this is equivalent to the above\n" "\n" -" 3. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n" +"\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" +"\t -o out.cram in.bam\n" "\n" -" 4. BAM->SAM conversion: `samtools view -h in.bam'.\n" +"2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" +" two fields of each line consisting of the reference name and the\n" +" corresponding sequence length. The `.fai' file generated by \n" +" `samtools faidx' is suitable for use as this file. This may be an\n" +" empty file if reads are unaligned.\n" "\n" -" 5. A region should be presented in one of the following formats:\n" -" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" -" specified, the input alignment file must be a sorted and indexed\n" -" alignment (BAM/CRAM) file.\n" +"3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n" "\n" -" 6. Option `-u' is preferred over `-b' when the output is piped to\n" -" another samtools command.\n" +"4. BAM->SAM conversion: samtools view -h in.bam\n" +"\n" +"5. 
A region should be presented in one of the following formats:\n" +" `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" +" specified, the input alignment file must be a sorted and indexed\n" +" alignment (BAM/CRAM) file.\n" +"\n" +"6. Option `-u' is preferred over `-b' when the output is piped to\n" +" another samtools command.\n" "\n"); return exit_status; @@ -597,7 +601,7 @@ int main_import(int argc, char *argv[]) int argc2, ret; char **argv2; if (argc != 4) { - fprintf(pysamerr, "Usage: samtools import \n"); + fprintf(pysam_stderr, "Usage: samtools import \n"); return 1; } argc2 = 6; @@ -613,6 +617,7 @@ static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; static void bam2fq_usage(FILE *to, const char *command) { + int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; fprintf(to, "Usage: samtools %s [options...] \n", command); fprintf(to, @@ -622,10 +627,14 @@ static void bam2fq_usage(FILE *to, const char *command) " -2 FILE write paired reads flagged READ2 to FILE\n" " -f INT only include reads with all bits set in INT set in FLAG [0]\n" " -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" -" -n don't append /1 and /2 to the read name\n" -" -O output quality in the OQ tag if present\n" +" -n don't append /1 and /2 to the read name\n"); + if (fq) fprintf(to, +" -O output quality in the OQ tag if present\n"); + fprintf(to, " -s FILE write singleton reads to FILE [assume single-end]\n" -" -t copy RG, BC and QT tags to the FASTQ header line\n" +" -t copy RG, BC and QT tags to the %s header line\n", + fq ? 
"FASTQ" : "FASTA"); + if (fq) fprintf(to, " -v INT default quality score if not given in file [1]\n"); sam_global_opt_help(to, "-.--."); } @@ -675,7 +684,10 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t uint8_t *seq; uint8_t *qual = bam_get_qual(b); const uint8_t *oq = NULL; - if (state->use_oq) oq = bam_aux_get(b, "OQ") + 1; + if (state->use_oq) { + oq = bam_aux_get(b, "OQ"); + if (oq) oq++; // skip tag type + } bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality linebuf->l = 0; @@ -776,10 +788,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) case 's': opts->fnse = optarg; break; case 't': opts->copy_tags = true; break; case 'v': opts->def_qual = atoi(optarg); break; - case '?': bam2fq_usage(pysamerr, argv[0]); free(opts); return false; + case '?': bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false; default: if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { - bam2fq_usage(pysamerr, argv[0]); free(opts); return false; + bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false; } break; } @@ -788,8 +800,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; if (opts->def_qual < 0 || 93 < opts->def_qual) { - fprintf(pysamerr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); - bam2fq_usage(pysamerr, argv[0]); + fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); + bam2fq_usage(pysam_stderr, argv[0]); free(opts); return true; } @@ -801,20 +813,20 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) opts->filetype = FASTA; } else { print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... 
but you managed it!", type_str); - bam2fq_usage(pysamerr, argv[0]); + bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false; } if ((argc - (optind)) == 0) { - bam2fq_usage(stdout, argv[0]); + bam2fq_usage(pysam_stdout, argv[0]); free(opts); return false; } if ((argc - (optind)) != 1) { - fprintf(pysamerr, "Too many arguments.\n"); - bam2fq_usage(pysamerr, argv[0]); + fprintf(pysam_stderr, "Too many arguments.\n"); + bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false; } @@ -843,12 +855,12 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; if (opts->use_oq) rf |= SAM_AUX; if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); free(state); return false; } if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); free(state); return false; } @@ -871,13 +883,13 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) return false; } } else { - state->fpr[i] = stdout; + state->fpr[i] = pysam_stdout; } } state->h = sam_hdr_read(state->fp); if (state->h == NULL) { - fprintf(pysamerr, "Failed to read header for \"%s\"\n", opts->fn_input); + fprintf(pysam_stderr, "Failed to read header for \"%s\"\n", opts->fn_input); free(state); return false; } @@ -894,7 +906,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* if (state->fpse && fclose(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } int i; for (i = 0; i < 3; ++i) { - if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } + if 
(state->fpr[i] != pysam_stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } } free(state); return valid; @@ -923,7 +935,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) bool valid = true; while (true) { - at_eof = sam_read1(state->fp, state->h, b); + at_eof = sam_read1(state->fp, state->h, b) < 0; if (!at_eof && filter_it_out(b, state)) continue; if (!at_eof) ++n_reads; @@ -960,7 +972,7 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1; if (b_score > score[which_readpart(b)]) { if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) { - fprintf(pysamerr, "[%s] Error converting read to FASTA/Q\n", __func__); + fprintf(pysam_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); return false; } score[which_readpart(b)] = b_score; @@ -975,8 +987,8 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) free(linebuf[0].s); free(linebuf[1].s); free(linebuf[2].s); - fprintf(pysamerr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); - fprintf(pysamerr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); + fprintf(pysam_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); + fprintf(pysam_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); return valid; } @@ -1003,7 +1015,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state) free(linebuf.s); bam_destroy1(b); - fprintf(pysamerr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); + fprintf(pysam_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); return true; } diff --git a/samtools/sample.c b/samtools/sample.c index aa38132..4cc89ce 100644 --- a/samtools/sample.c +++ b/samtools/sample.c @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "sample.h" diff --git a/samtools/sample.c.pysam.c b/samtools/sample.c.pysam.c index 73ec01f..dff8188 100644 --- a/samtools/sample.c.pysam.c +++ b/samtools/sample.c.pysam.c @@ -25,6 +25,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include "sample.h" diff --git a/samtools/stats.c b/samtools/stats.c index 512df1d..eb6bb52 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -37,6 +37,8 @@ DEALINGS IN THE SOFTWARE. */ */ +#include + #include // for isatty() #include #include diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index e30b2ad..da187ac 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -39,6 +39,8 @@ DEALINGS IN THE SOFTWARE. */ */ +#include + #include // for isatty() #include #include @@ -1240,7 +1242,7 @@ void init_regions(stats_t *stats, const char *file) if ( tid < 0 ) { if ( !warned ) - fprintf(pysamerr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line.s); + fprintf(pysam_stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". 
This message is printed only once.\n", line.s); warned = 1; continue; } @@ -1334,7 +1336,7 @@ void init_group_id(stats_t *stats, const char *id) { khiter_t k = kh_get(kh_rg, stats->rg_hash, key); if ( k != kh_end(stats->rg_hash) ) - fprintf(pysamerr, "[init_group_id] The group ID not unique: \"%s\"\n", key); + fprintf(pysam_stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); int ret; k = kh_put(kh_rg, stats->rg_hash, key, &ret); kh_value(stats->rg_hash, k) = val; @@ -1344,7 +1346,7 @@ void init_group_id(stats_t *stats, const char *id) if ( !n ) error("The sample or read group \"%s\" not present.\n", id); #else - fprintf(pysamerr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n"); + fprintf(pysam_stderr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n"); abort(); #endif } @@ -1354,35 +1356,35 @@ static void error(const char *format, ...) { if ( !format ) { - printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamstats.\n"); - printf("Usage: samtools stats [OPTIONS] file.bam\n"); - printf(" samtools stats [OPTIONS] file.bam chr:from-to\n"); - printf("Options:\n"); - printf(" -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); - printf(" -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); - printf(" -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); - printf(" -F, --filtering-flag Filtering flag, 0 for unset. 
See also `samtools flags` [0]\n"); - printf(" --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); - printf(" -h, --help This help message\n"); - printf(" -i, --insert-size Maximum insert size [8000]\n"); - printf(" -I, --id Include only listed read group or sample name\n"); - printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); - printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); - printf(" -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); - printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); - printf(" -r, --ref-seq Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n"); - printf(" -s, --sam Ignored (input format is auto-detected).\n"); - printf(" -S, --split Also write statistics to separate files split by tagged field.\n"); - printf(" -t, --target-regions Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); - printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); - sam_global_opt_help(stdout, "-.--."); - printf("\n"); + fprintf(pysam_stdout, "About: The program collects statistics from BAM files. The output can be visualized using plot-bamstats.\n"); + fprintf(pysam_stdout, "Usage: samtools stats [OPTIONS] file.bam\n"); + fprintf(pysam_stdout, " samtools stats [OPTIONS] file.bam chr:from-to\n"); + fprintf(pysam_stdout, "Options:\n"); + fprintf(pysam_stdout, " -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); + fprintf(pysam_stdout, " -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); + fprintf(pysam_stdout, " -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); + fprintf(pysam_stdout, " -F, --filtering-flag Filtering flag, 0 for unset. 
See also `samtools flags` [0]\n"); + fprintf(pysam_stdout, " --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); + fprintf(pysam_stdout, " -h, --help This help message\n"); + fprintf(pysam_stdout, " -i, --insert-size Maximum insert size [8000]\n"); + fprintf(pysam_stdout, " -I, --id Include only listed read group or sample name\n"); + fprintf(pysam_stdout, " -l, --read-length Include in the statistics only reads with the given read length []\n"); + fprintf(pysam_stdout, " -m, --most-inserts Report only the main part of inserts [0.99]\n"); + fprintf(pysam_stdout, " -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); + fprintf(pysam_stdout, " -q, --trim-quality The BWA trimming parameter [0]\n"); + fprintf(pysam_stdout, " -r, --ref-seq Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n"); + fprintf(pysam_stdout, " -s, --sam Ignored (input format is auto-detected).\n"); + fprintf(pysam_stdout, " -S, --split Also write statistics to separate files split by tagged field.\n"); + fprintf(pysam_stdout, " -t, --target-regions Do stats in these regions only. 
Tab-delimited file chr,from,to, 1-based, inclusive.\n"); + fprintf(pysam_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); + sam_global_opt_help(pysam_stdout, "-.--."); + fprintf(pysam_stdout, "\n"); } else { va_list ap; va_start(ap, format); - vfprintf(pysamerr, format, ap); + vfprintf(pysam_stderr, format, ap); va_end(ap); } exit(1); @@ -1708,13 +1710,13 @@ int main_stats(int argc, char *argv[]) } if (ret < -1) { - fprintf(pysamerr, "Failure while decoding file\n"); + fprintf(pysam_stderr, "Failure while decoding file\n"); return 1; } } round_buffer_flush(all_stats, -1); - output_stats(stdout, all_stats, sparse); + output_stats(pysam_stdout, all_stats, sparse); if (info->split_tag) output_split_stats(split_hash, bam_fname, sparse); diff --git a/samtools/stats_isize.c b/samtools/stats_isize.c index e6b9dc1..3aa9c20 100644 --- a/samtools/stats_isize.c +++ b/samtools/stats_isize.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include "stats_isize.h" #include diff --git a/samtools/stats_isize.c.pysam.c b/samtools/stats_isize.c.pysam.c index a25e4d7..6ae9088 100644 --- a/samtools/stats_isize.c.pysam.c +++ b/samtools/stats_isize.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include "stats_isize.h" #include @@ -94,7 +96,7 @@ static void sparse_set_f(isize_data_t data, int at, isize_insert_t field, uint64 kh_value(h, it) = rec; a->max = max(at, a->max); } else { - fprintf(pysamerr, "%s\n", "Failed to allocate memory for isize_sparse_record_t"); + fprintf(pysam_stderr, "%s\n", "Failed to allocate memory for isize_sparse_record_t"); exit(11); } } else { diff --git a/samtools/test/merge/test_bam_translate.c b/samtools/test/merge/test_bam_translate.c index 854779b..6ed561e 100644 --- a/samtools/test/merge/test_bam_translate.c +++ b/samtools/test/merge/test_bam_translate.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_sort.c" #include "../test.h" #include diff --git a/samtools/test/merge/test_bam_translate.c.pysam.c b/samtools/test/merge/test_bam_translate.c.pysam.c index d11fbf8..193954d 100644 --- a/samtools/test/merge/test_bam_translate.c.pysam.c +++ b/samtools/test/merge/test_bam_translate.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_sort.c" #include "../test.h" #include @@ -33,40 +35,40 @@ DEALINGS IN THE SOFTWARE. 
*/ #include void dump_read(bam1_t* b) { - printf("->core.tid:(%d)\n", b->core.tid); - printf("->core.pos:(%d)\n", b->core.pos); - printf("->core.bin:(%d)\n", b->core.bin); - printf("->core.qual:(%d)\n", b->core.qual); - printf("->core.l_qname:(%d)\n", b->core.l_qname); - printf("->core.flag:(%d)\n", b->core.flag); - printf("->core.n_cigar:(%d)\n", b->core.n_cigar); - printf("->core.l_qseq:(%d)\n", b->core.l_qseq); - printf("->core.mtid:(%d)\n", b->core.mtid); - printf("->core.mpos:(%d)\n", b->core.mpos); - printf("->core.isize:(%d)\n", b->core.isize); + fprintf(pysam_stdout, "->core.tid:(%d)\n", b->core.tid); + fprintf(pysam_stdout, "->core.pos:(%d)\n", b->core.pos); + fprintf(pysam_stdout, "->core.bin:(%d)\n", b->core.bin); + fprintf(pysam_stdout, "->core.qual:(%d)\n", b->core.qual); + fprintf(pysam_stdout, "->core.l_qname:(%d)\n", b->core.l_qname); + fprintf(pysam_stdout, "->core.flag:(%d)\n", b->core.flag); + fprintf(pysam_stdout, "->core.n_cigar:(%d)\n", b->core.n_cigar); + fprintf(pysam_stdout, "->core.l_qseq:(%d)\n", b->core.l_qseq); + fprintf(pysam_stdout, "->core.mtid:(%d)\n", b->core.mtid); + fprintf(pysam_stdout, "->core.mpos:(%d)\n", b->core.mpos); + fprintf(pysam_stdout, "->core.isize:(%d)\n", b->core.isize); if (b->data) { - printf("->data:"); + fprintf(pysam_stdout, "->data:"); int i; for (i = 0; i < b->l_data; ++i) { - printf("%x ", b->data[i]); + fprintf(pysam_stdout, "%x ", b->data[i]); } - printf("\n"); + fprintf(pysam_stdout, "\n"); } if (b->core.l_qname) { - printf("qname: %s\n",bam_get_qname(b)); + fprintf(pysam_stdout, "qname: %s\n",bam_get_qname(b)); } if (b->core.l_qseq) { - printf("qseq:"); + fprintf(pysam_stdout, "qseq:"); int i; for (i = 0; i < b->core.l_qseq; ++i) { - printf("%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]); + fprintf(pysam_stdout, "%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]); } - printf("\n"); - printf("qual:"); + fprintf(pysam_stdout, "\n"); + fprintf(pysam_stdout, "qual:"); for (i = 0; i 
< b->core.l_qseq; ++i) { - printf("%c",bam_get_qual(b)[i]); + fprintf(pysam_stdout, "%c",bam_get_qual(b)[i]); } - printf("\n"); + fprintf(pysam_stdout, "\n"); } @@ -75,18 +77,18 @@ void dump_read(bam1_t* b) { uint8_t* aux = bam_get_aux(b); while (i < bam_get_l_aux(b)) { - printf("%.2s:%c:",aux+i,*(aux+i+2)); + fprintf(pysam_stdout, "%.2s:%c:",aux+i,*(aux+i+2)); i += 2; switch (*(aux+i)) { case 'Z': - while (*(aux+1+i) != '\0') { putc(*(aux+1+i), stdout); ++i; } + while (*(aux+1+i) != '\0') { putc(*(aux+1+i), pysam_stdout); ++i; } break; } - putc('\n',stdout); + putc('\n',pysam_stdout); ++i;++i; } } - printf("\n"); + fprintf(pysam_stdout, "\n"); } void trans_tbl_test_init(trans_tbl_t* tbl, int32_t n_targets) @@ -334,7 +336,7 @@ void setup_test_6(bam1_t** b_in, trans_tbl_t* tbl) { } -int main(int argc, char**argv) +int samtools_test_bam_translate_main(int argc, char**argv) { // test state const int NUM_TESTS = 6; @@ -355,30 +357,30 @@ int main(int argc, char**argv) bam1_t* b; - // Setup pysamerr redirect + // Setup pysam_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr + FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr char* tempfname = (optind < argc)? 
argv[optind] : "test_bam_translate.tmp"; FILE* check = NULL; // setup - if (verbose) printf("BEGIN test 1\n"); // TID test + if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // TID test trans_tbl_t tbl1; setup_test_1(&b,&tbl1); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } - if (verbose) printf("RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bam_translate(b, &tbl1); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } @@ -390,33 +392,33 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 1\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl1); - if (verbose) printf("END test 1\n"); + if (verbose) fprintf(pysam_stdout, "END test 1\n"); // setup - if (verbose) printf("BEGIN test 2\n"); // RG exists and translate test + if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); // RG exists and translate test trans_tbl_t tbl2; setup_test_2(&b,&tbl2); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } - if (verbose) printf("RUN test 2\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 2\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bam_translate(b, &tbl2); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 2\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 2\n"); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } @@ -428,33 +430,33 @@ int main(int argc, 
char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 2\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 2\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl2); - if (verbose) printf("END test 2\n"); + if (verbose) fprintf(pysam_stdout, "END test 2\n"); - if (verbose) printf("BEGIN test 3\n"); // PG exists and translate test + if (verbose) fprintf(pysam_stdout, "BEGIN test 3\n"); // PG exists and translate test // setup trans_tbl_t tbl3; setup_test_3(&b,&tbl3); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } - if (verbose) printf("RUN test 3\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 3\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bam_translate(b, &tbl3); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 3\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 3\n"); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } @@ -466,33 +468,33 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 3\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 3\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl3); - if (verbose) printf("END test 3\n"); + if (verbose) fprintf(pysam_stdout, "END test 3\n"); - if (verbose) printf("BEGIN test 4\n"); // RG test non-existent + if (verbose) fprintf(pysam_stdout, "BEGIN test 4\n"); // RG test non-existent // setup trans_tbl_t tbl4; setup_test_4(&b,&tbl4); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } - if (verbose) printf("RUN test 4\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 4\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bam_translate(b, &tbl4); - 
fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 4\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 4\n"); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } // check result @@ -503,32 +505,32 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 4\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 4\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl4); - if (verbose) printf("END test 4\n"); + if (verbose) fprintf(pysam_stdout, "END test 4\n"); - if (verbose) printf("BEGIN test 5\n"); // PG test non-existent + if (verbose) fprintf(pysam_stdout, "BEGIN test 5\n"); // PG test non-existent // setup trans_tbl_t tbl5; setup_test_5(&b,&tbl5); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); - printf("RUN test 5\n"); + fprintf(pysam_stdout, "RUN test 5\n"); } // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bam_translate(b, &tbl5); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 5\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 5\n"); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } @@ -540,33 +542,33 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 5\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 5\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl5); - if (verbose) printf("END test 5\n"); + if (verbose) fprintf(pysam_stdout, "END test 5\n"); - if (verbose) printf("BEGIN test 6\n"); // RG and PG exists and translate test + if (verbose) fprintf(pysam_stdout, "BEGIN test 6\n"); // RG and PG exists and translate test // setup trans_tbl_t tbl6; setup_test_6(&b,&tbl6); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } - if 
(verbose) printf("RUN test 6\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 6\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bam_translate(b, &tbl6); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 6\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 6\n"); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_read(b); } @@ -578,21 +580,21 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 6\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 6\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl6); - if (verbose) printf("END test 6\n"); + if (verbose) fprintf(pysam_stdout, "END test 6\n"); // Cleanup free(res.s); remove(tempfname); if (failure > 0) - fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success); - fclose(orig_pysamerr); + fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_pysam_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/merge/test_rtrans_build.c b/samtools/test/merge/test_rtrans_build.c index df50921..0f23b48 100644 --- a/samtools/test/merge/test_rtrans_build.c +++ b/samtools/test/merge/test_rtrans_build.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include "../../bam_sort.c" void dump_rtrans(int* rtrans, int n, int n_targets) { diff --git a/samtools/test/merge/test_rtrans_build.c.pysam.c b/samtools/test/merge/test_rtrans_build.c.pysam.c index fcbc458..0ac1367 100644 --- a/samtools/test/merge/test_rtrans_build.c.pysam.c +++ b/samtools/test/merge/test_rtrans_build.c.pysam.c @@ -24,16 +24,18 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_sort.c" void dump_rtrans(int* rtrans, int n, int n_targets) { - printf("->n_targets:(%d)\n", n_targets); + fprintf(pysam_stdout, "->n_targets:(%d)\n", n_targets); int i, j; for (i = 0; i < n; ++i) { - fprintf(pysamerr, "%d",rtrans[i*n_targets+0]); + fprintf(pysam_stderr, "%d",rtrans[i*n_targets+0]); for (j = 1; j < n_targets; ++j) - fprintf(pysamerr, "\t%d",rtrans[i*n_targets+j]); - fprintf(pysamerr, "\n"); + fprintf(pysam_stderr, "\t%d",rtrans[i*n_targets+j]); + fprintf(pysam_stderr, "\n"); } } @@ -62,7 +64,7 @@ bool check_test_1(trans_tbl_t* tbl, int* rtrans) { } -int main(int argc, char**argv) +int samtools_test_rtrans_build_main(int argc, char**argv) { const int NUM_TESTS = 1; int verbose = 0; @@ -81,7 +83,7 @@ int main(int argc, char**argv) const long GIMMICK_SEED = 0x1234330e; srand48(GIMMICK_SEED); - if (verbose) printf("BEGIN test 1\n"); + if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // setup trans_tbl_t tbl_1[2]; int n_targets_1 = 3; @@ -92,29 +94,29 @@ int main(int argc, char**argv) if (verbose > 1) { // dump_trans_tid } - if (verbose) printf("RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); rtrans_1 = rtrans_build(n_1, n_targets_1, &tbl_1[0]); - if (verbose) printf("END RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); if (verbose > 1) { - printf("rtrans\n"); + fprintf(pysam_stdout, "rtrans\n"); dump_rtrans(rtrans_1, n_1, n_targets_1); } if 
(check_test_1(&tbl_1[0], rtrans_1)) { ++success; } else { ++failure; - if (verbose) printf("FAIL test 1\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); } // teardown trans_tbl_destroy(&tbl_1[0]); trans_tbl_destroy(&tbl_1[1]); free(rtrans_1); - if (verbose) printf("END test 1\n"); + if (verbose) fprintf(pysam_stdout, "END test 1\n"); if (success == NUM_TESTS) { return 0; } else { - fprintf(pysamerr, "%d failures %d successes\n", failure, success); + fprintf(pysam_stderr, "%d failures %d successes\n", failure, success); return 1; } } diff --git a/samtools/test/merge/test_trans_tbl_init.c b/samtools/test/merge/test_trans_tbl_init.c index b1164a3..d557932 100644 --- a/samtools/test/merge/test_trans_tbl_init.c +++ b/samtools/test/merge/test_trans_tbl_init.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_sort.c" #include #include @@ -47,7 +49,7 @@ void dump_header(bam_hdr_t* hdr) { static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) { trans_tbl_t dummy; int res; - res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, NULL); + res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL); trans_tbl_destroy(&dummy); return res; } @@ -359,7 +361,7 @@ int main(int argc, char**argv) dump_header(translate); } if (verbose) printf("RUN test 1\n"); - trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, NULL); + trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); if (verbose) printf("END RUN test 1\n"); @@ -396,7 +398,7 @@ int main(int argc, char**argv) dump_header(translate); } if (verbose) printf("RUN test 2\n"); - trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, NULL); + trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL); out = 
finish_merged_header(merged_hdr); free_merged_header(merged_hdr); if (verbose) printf("END RUN test 2\n"); @@ -432,7 +434,7 @@ int main(int argc, char**argv) dump_header(translate); } if (verbose) printf("RUN test 3\n"); - trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, NULL); + trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); if (verbose) printf("END RUN test 3\n"); @@ -468,7 +470,7 @@ int main(int argc, char**argv) dump_header(translate); } if (verbose) printf("RUN test 4\n"); - trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, NULL); + trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); if (verbose) printf("END RUN test 4\n"); @@ -505,7 +507,7 @@ int main(int argc, char**argv) dump_header(translate); } if (verbose) printf("RUN test 5\n"); - trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, NULL); + trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); if (verbose) printf("END RUN test 5\n"); @@ -541,7 +543,7 @@ int main(int argc, char**argv) dump_header(translate); } if (verbose) printf("RUN test 6\n"); - trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, "filename"); + trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename"); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); if (verbose) printf("END RUN test 6\n"); diff --git a/samtools/test/merge/test_trans_tbl_init.c.pysam.c b/samtools/test/merge/test_trans_tbl_init.c.pysam.c index 0f54989..af8af43 100644 --- a/samtools/test/merge/test_trans_tbl_init.c.pysam.c +++ b/samtools/test/merge/test_trans_tbl_init.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_sort.c" #include #include @@ -34,22 +36,22 @@ typedef struct refseq_info { } refseq_info_t; void dump_header(bam_hdr_t* hdr) { - printf("->n_targets:(%d)\n", hdr->n_targets); + fprintf(pysam_stdout, "->n_targets:(%d)\n", hdr->n_targets); int i; for (i = 0; i < hdr->n_targets; ++i) { - printf("->target_name[%d]:(%s)\n",i,hdr->target_name[i]); - printf("->target_len[%d]:(%d)\n",i,hdr->target_len[i]); + fprintf(pysam_stdout, "->target_name[%d]:(%s)\n",i,hdr->target_name[i]); + fprintf(pysam_stdout, "->target_len[%d]:(%d)\n",i,hdr->target_len[i]); } - printf("->text:("); - fwrite((void*)hdr->text, (size_t) hdr->l_text, 1, stdout); - printf(")\n"); + fprintf(pysam_stdout, "->text:("); + fwrite((void*)hdr->text, (size_t) hdr->l_text, 1, pysam_stdout); + fprintf(pysam_stdout, ")\n"); } static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) { trans_tbl_t dummy; int res; - res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, NULL); + res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL); trans_tbl_destroy(&dummy); return res; } @@ -325,7 +327,7 @@ bool check_test_6(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { return true; } -int main(int argc, char**argv) +int samtools_test_trans_tbl_init_main(int argc, char**argv) { const int NUM_TESTS = 6; int verbose = 0; @@ -349,7 +351,7 @@ int main(int argc, char**argv) bam_hdr_t* out; bam_hdr_t* translate; - if (verbose) printf("BEGIN test 1\n"); + if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // setup trans_tbl_t tbl_1; merged_header_t *merged_hdr = init_merged_header(); @@ -357,36 +359,36 @@ int main(int argc, char**argv) assert(translate); // test if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); } - if (verbose) printf("RUN test 1\n"); - trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, NULL); + if (verbose) fprintf(pysam_stdout, "RUN 
test 1\n"); + trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); - printf("out\n"); + fprintf(pysam_stdout, "out\n"); dump_header(out); } if (check_test_1(translate, out, &tbl_1)) { - if (verbose) printf("Test 1 : PASS\n"); + if (verbose) fprintf(pysam_stdout, "Test 1 : PASS\n"); ++success; } else { - if (verbose) printf("Test 1 : FAIL\n"); - fprintf(pysamerr, "Test 1 : FAIL\n"); + if (verbose) fprintf(pysam_stdout, "Test 1 : FAIL\n"); + fprintf(pysam_stderr, "Test 1 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_1); - if (verbose) printf("END test 1\n"); + if (verbose) fprintf(pysam_stdout, "END test 1\n"); // test - if (verbose) printf("BEGIN test 2\n"); + if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); // reinit trans_tbl_t tbl_2; @@ -394,108 +396,108 @@ int main(int argc, char**argv) translate = setup_test_2(merged_hdr); assert(translate); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); } - if (verbose) printf("RUN test 2\n"); - trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, NULL); + if (verbose) fprintf(pysam_stdout, "RUN test 2\n"); + trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 2\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 2\n"); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); - printf("out\n"); + fprintf(pysam_stdout, "out\n"); dump_header(out); } if (check_test_2(translate, out, &tbl_2)) { - if (verbose) 
printf("Test 2 : PASS\n"); + if (verbose) fprintf(pysam_stdout, "Test 2 : PASS\n"); ++success; } else { - if (verbose) printf("Test 2 : FAIL\n"); - fprintf(pysamerr, "Test 2 : FAIL\n"); + if (verbose) fprintf(pysam_stdout, "Test 2 : FAIL\n"); + fprintf(pysam_stderr, "Test 2 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_2); - if (verbose) printf("END test 2\n"); + if (verbose) fprintf(pysam_stdout, "END test 2\n"); // test - if (verbose) printf("BEGIN test 3\n"); + if (verbose) fprintf(pysam_stdout, "BEGIN test 3\n"); // reinit trans_tbl_t tbl_3; merged_hdr = init_merged_header(); translate = setup_test_3(merged_hdr); assert(translate); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); } - if (verbose) printf("RUN test 3\n"); - trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, NULL); + if (verbose) fprintf(pysam_stdout, "RUN test 3\n"); + trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 3\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 3\n"); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); - printf("out\n"); + fprintf(pysam_stdout, "out\n"); dump_header(out); } if (check_test_3(translate, out, &tbl_3)) { - if (verbose) printf("Test 3 : PASS\n"); + if (verbose) fprintf(pysam_stdout, "Test 3 : PASS\n"); ++success; } else { - if (verbose) printf("Test 3 : FAIL\n"); - fprintf(pysamerr, "Test 3 : FAIL\n"); + if (verbose) fprintf(pysam_stdout, "Test 3 : FAIL\n"); + fprintf(pysam_stderr, "Test 3 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_3); - if (verbose) printf("END test 3\n"); + if (verbose) fprintf(pysam_stdout, "END test 3\n"); // test - if (verbose) printf("BEGIN test 4\n"); 
+ if (verbose) fprintf(pysam_stdout, "BEGIN test 4\n"); // reinit trans_tbl_t tbl_4; merged_hdr = init_merged_header(); translate = setup_test_4(merged_hdr); assert(translate); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); } - if (verbose) printf("RUN test 4\n"); - trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, NULL); + if (verbose) fprintf(pysam_stdout, "RUN test 4\n"); + trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 4\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 4\n"); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); - printf("out\n"); + fprintf(pysam_stdout, "out\n"); dump_header(out); } if (check_test_4(translate, out, &tbl_4)) { - if (verbose) printf("Test 4 : PASS\n"); + if (verbose) fprintf(pysam_stdout, "Test 4 : PASS\n"); ++success; } else { - if (verbose) printf("Test 4 : FAIL\n"); - fprintf(pysamerr, "Test 4 : FAIL\n"); + if (verbose) fprintf(pysam_stdout, "Test 4 : FAIL\n"); + fprintf(pysam_stderr, "Test 4 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_4); - if (verbose) printf("END test 4\n"); + if (verbose) fprintf(pysam_stdout, "END test 4\n"); // test - if (verbose) printf("BEGIN test 5\n"); + if (verbose) fprintf(pysam_stdout, "BEGIN test 5\n"); // reinit trans_tbl_t tbl_5; merged_hdr = init_merged_header(); @@ -503,74 +505,74 @@ int main(int argc, char**argv) assert(translate); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); } - if (verbose) printf("RUN test 5\n"); - trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, NULL); + if (verbose) fprintf(pysam_stdout, "RUN test 5\n"); + trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, 
NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 5\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 5\n"); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); - printf("out\n"); + fprintf(pysam_stdout, "out\n"); dump_header(out); } if (check_test_5(translate, out, &tbl_5)) { - if (verbose) printf("Test 5 : PASS\n"); + if (verbose) fprintf(pysam_stdout, "Test 5 : PASS\n"); ++success; } else { - if (verbose) printf("Test 5 : FAIL\n"); - fprintf(pysamerr, "Test 5 : FAIL\n"); + if (verbose) fprintf(pysam_stdout, "Test 5 : FAIL\n"); + fprintf(pysam_stderr, "Test 5 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_5); - if (verbose) printf("END test 5\n"); + if (verbose) fprintf(pysam_stdout, "END test 5\n"); // test - if (verbose) printf("BEGIN test 6\n"); + if (verbose) fprintf(pysam_stdout, "BEGIN test 6\n"); // reinit trans_tbl_t tbl_6; merged_hdr = init_merged_header(); translate = setup_test_6(merged_hdr); assert(translate); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); } - if (verbose) printf("RUN test 6\n"); - trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, "filename"); + if (verbose) fprintf(pysam_stdout, "RUN test 6\n"); + trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename"); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 6\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 6\n"); if (verbose > 1) { - printf("translate\n"); + fprintf(pysam_stdout, "translate\n"); dump_header(translate); - printf("out\n"); + fprintf(pysam_stdout, "out\n"); dump_header(out); } if (check_test_6(translate, out, &tbl_6)) { - if (verbose) printf("Test 6 : PASS\n"); + if (verbose) fprintf(pysam_stdout, "Test 6 : PASS\n"); ++success; } 
else { - if (verbose) printf("Test 6 : FAIL\n"); - fprintf(pysamerr, "Test 6 : FAIL\n"); + if (verbose) fprintf(pysam_stdout, "Test 6 : FAIL\n"); + fprintf(pysam_stderr, "Test 6 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_6); - if (verbose) printf("END test 6\n"); + if (verbose) fprintf(pysam_stdout, "END test 6\n"); if (success == NUM_TESTS) { return 0; } else { - fprintf(pysamerr, "%d failures %d successes\n", failure, success); + fprintf(pysam_stderr, "%d failures %d successes\n", failure, success); return 1; } } diff --git a/samtools/test/split/test_count_rg.c b/samtools/test/split/test_count_rg.c index 97512a8..4038f97 100644 --- a/samtools/test/split/test_count_rg.c +++ b/samtools/test/split/test_count_rg.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_split.c" #include "../test.h" #include diff --git a/samtools/test/split/test_count_rg.c.pysam.c b/samtools/test/split/test_count_rg.c.pysam.c index eda8abb..25131a8 100644 --- a/samtools/test/split/test_count_rg.c.pysam.c +++ b/samtools/test/split/test_count_rg.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include "../../bam_split.c" #include "../test.h" #include @@ -40,7 +42,7 @@ void setup_test_1(bam_hdr_t** hdr_in) (*hdr_in)->l_text = strlen(test1); } -int main(int argc, char**argv) +int samtools_test_count_rg_main(int argc, char**argv) { // test state const int NUM_TESTS = 1; @@ -55,7 +57,7 @@ int main(int argc, char**argv) ++verbose; break; default: - printf( + fprintf(pysam_stdout, "usage: test_count_rg [-v]\n\n" " -v verbose output\n" ); @@ -64,32 +66,32 @@ int main(int argc, char**argv) } - // Setup pysamerr redirect + // Setup pysam_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr + FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp"; FILE* check = NULL; // setup - if (verbose) printf("BEGIN test 1\n"); // TID test + if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // TID test bam_hdr_t* hdr1; size_t count; char** output; setup_test_1(&hdr1); if (verbose > 1) { - printf("hdr1\n"); + fprintf(pysam_stdout, "hdr1\n"); dump_hdr(hdr1); } - if (verbose) printf("RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bool result_1 = count_RG(hdr1, &count, &output); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); if (verbose > 1) { - printf("b\n"); + fprintf(pysam_stdout, "b\n"); dump_hdr(hdr1); } @@ -101,7 +103,7 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 1\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); } fclose(check); @@ -112,14 +114,14 @@ int main(int argc, char**argv) } free(output); bam_hdr_destroy(hdr1); - if (verbose) printf("END test 1\n"); + if 
(verbose) fprintf(pysam_stdout, "END test 1\n"); // Cleanup free(res.s); remove(tempfname); if (failure > 0) - fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success); - fclose(orig_pysamerr); + fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_pysam_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/split/test_expand_format_string.c b/samtools/test/split/test_expand_format_string.c index ede7586..7c90b62 100644 --- a/samtools/test/split/test_expand_format_string.c +++ b/samtools/test/split/test_expand_format_string.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_split.c" #include "../test.h" #include diff --git a/samtools/test/split/test_expand_format_string.c.pysam.c b/samtools/test/split/test_expand_format_string.c.pysam.c index 94e7732..fe9a426 100644 --- a/samtools/test/split/test_expand_format_string.c.pysam.c +++ b/samtools/test/split/test_expand_format_string.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include "../../bam_split.c" #include "../test.h" #include @@ -40,7 +42,7 @@ void setup_test_1(bam_hdr_t** hdr_in) (*hdr_in)->l_text = strlen(test1); } -int main(int argc, char**argv) +int samtools_test_expand_format_string_main(int argc, char**argv) { // test state const int NUM_TESTS = 1; @@ -55,7 +57,7 @@ int main(int argc, char**argv) ++verbose; break; default: - printf( + fprintf(pysam_stdout, "usage: test_expand_format_string [-v]\n\n" " -v verbose output\n" ); @@ -64,34 +66,34 @@ int main(int argc, char**argv) } - // Setup pysamerr redirect + // Setup pysam_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr + FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr char* tempfname = (optind < argc)? argv[optind] : "test_expand_format_string.tmp"; FILE* check = NULL; // setup - if (verbose) printf("BEGIN test 1\n"); // default format string test + if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // default format string test const char* format_string_1 = "%*_%#.bam"; const char* basename_1 = "basename"; const char* rg_id_1 = "1#2.3"; const int rg_idx_1 = 4; if (verbose > 1) { - printf("format_string:%s\n" + fprintf(pysam_stdout, "format_string:%s\n" "basename:%s\n" "rg_id:%s\n" "rg_idx:%d\n", format_string_1, basename_1, rg_id_1, rg_idx_1); } - if (verbose) printf("RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe char* output_1 = expand_format_string(format_string_1, basename_1, rg_id_1, rg_idx_1, NULL); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); if (verbose > 1) { - printf("format_string:%s\n" + fprintf(pysam_stdout, "format_string:%s\n" "basename:%s\n" "rg_id:%s\n" 
"rg_idx:%d\n", format_string_1, basename_1, rg_id_1, rg_idx_1); @@ -106,20 +108,20 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 1\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); } fclose(check); // teardown free(output_1); - if (verbose) printf("END test 1\n"); + if (verbose) fprintf(pysam_stdout, "END test 1\n"); // Cleanup test harness free(res.s); remove(tempfname); if (failure > 0) - fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success); - fclose(orig_pysamerr); + fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_pysam_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/split/test_filter_header_rg.c b/samtools/test/split/test_filter_header_rg.c index f4e1266..d9505d6 100644 --- a/samtools/test/split/test_filter_header_rg.c +++ b/samtools/test/split/test_filter_header_rg.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_split.c" #include "../test.h" #include diff --git a/samtools/test/split/test_filter_header_rg.c.pysam.c b/samtools/test/split/test_filter_header_rg.c.pysam.c index 4a5b6d5..97b3573 100644 --- a/samtools/test/split/test_filter_header_rg.c.pysam.c +++ b/samtools/test/split/test_filter_header_rg.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include "../../bam_split.c" #include "../test.h" #include @@ -73,7 +75,7 @@ bool check_test_2(const bam_hdr_t* hdr) { return true; } -int main(int argc, char**argv) +int samtools_test_filter_header_rg_main(int argc, char**argv) { // test state const int NUM_TESTS = 2; @@ -88,7 +90,7 @@ int main(int argc, char**argv) ++verbose; break; default: - printf( + fprintf(pysam_stdout, "usage: test_filter_header_rg [-v]\n\n" " -v verbose output\n" ); @@ -97,31 +99,31 @@ int main(int argc, char**argv) } - // Setup pysamerr redirect + // Setup pysam_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr + FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp"; FILE* check = NULL; // setup - if (verbose) printf("BEGIN test 1\n"); // test eliminating a tag that isn't there + if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // test eliminating a tag that isn't there bam_hdr_t* hdr1; const char* id_to_keep_1 = "1#2.3"; setup_test_1(&hdr1); if (verbose > 1) { - printf("hdr1\n"); + fprintf(pysam_stdout, "hdr1\n"); dump_hdr(hdr1); } - if (verbose) printf("RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bool result_1 = filter_header_rg(hdr1, id_to_keep_1); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 1\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); if (verbose > 1) { - printf("hdr1\n"); + fprintf(pysam_stdout, "hdr1\n"); dump_hdr(hdr1); } @@ -135,32 +137,32 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 1\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); } fclose(check); // teardown bam_hdr_destroy(hdr1); - if (verbose) 
printf("END test 1\n"); + if (verbose) fprintf(pysam_stdout, "END test 1\n"); - if (verbose) printf("BEGIN test 2\n"); // test eliminating a tag that is there + if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there bam_hdr_t* hdr2; const char* id_to_keep_2 = "fish"; setup_test_2(&hdr2); if (verbose > 1) { - printf("hdr2\n"); + fprintf(pysam_stdout, "hdr2\n"); dump_hdr(hdr2); } - if (verbose) printf("RUN test 2\n"); + if (verbose) fprintf(pysam_stdout, "RUN test 2\n"); // test - xfreopen(tempfname, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe bool result_2 = filter_header_rg(hdr2, id_to_keep_2); - fclose(pysamerr); + fclose(pysam_stderr); - if (verbose) printf("END RUN test 2\n"); + if (verbose) fprintf(pysam_stdout, "END RUN test 2\n"); if (verbose > 1) { - printf("hdr2\n"); + fprintf(pysam_stdout, "hdr2\n"); dump_hdr(hdr2); } @@ -174,21 +176,21 @@ int main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) printf("FAIL test 2\n"); + if (verbose) fprintf(pysam_stdout, "FAIL test 2\n"); } fclose(check); // teardown bam_hdr_destroy(hdr2); - if (verbose) printf("END test 2\n"); + if (verbose) fprintf(pysam_stdout, "END test 2\n"); // Cleanup free(res.s); remove(tempfname); if (failure > 0) - fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success); - fclose(orig_pysamerr); + fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_pysam_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/split/test_parse_args.c b/samtools/test/split/test_parse_args.c index 66c7c88..85a196a 100644 --- a/samtools/test/split/test_parse_args.c +++ b/samtools/test/split/test_parse_args.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include "../../bam_split.c" #include "../test.h" #include diff --git a/samtools/test/split/test_parse_args.c.pysam.c b/samtools/test/split/test_parse_args.c.pysam.c index 608ec7c..2c3e749 100644 --- a/samtools/test/split/test_parse_args.c.pysam.c +++ b/samtools/test/split/test_parse_args.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_split.c" #include "../test.h" #include @@ -65,7 +67,7 @@ bool check_test_2(const parsed_opts_t* opts) { return true; } -int main(int argc, char**argv) +int samtools_test_parse_args_main(int argc, char**argv) { // test state const int NUM_TESTS = 2; @@ -80,7 +82,7 @@ int main(int argc, char**argv) ++verbose; break; default: - printf( + fprintf(pysam_stdout, "usage: test_parse_args [-v]\n\n" " -v verbose output\n" ); @@ -88,58 +90,58 @@ int main(int argc, char**argv) } } - // Setup stdout and pysamerr redirect - kstring_t res_stdout = { 0, 0, NULL }; - kstring_t res_pysamerr = { 0, 0, NULL }; - FILE* orig_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save pysamerr - FILE* orig_pysamerr = fdopen(dup(STDERR_FILENO), "a"); // Save pysamerr - char* tempfname_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o"; - char* tempfname_pysamerr = (optind < argc)? argv[optind] : "test_parse_args.tmp.e"; - FILE* check_stdout = NULL; - FILE* check_pysamerr = NULL; + // Setup pysam_stdout and pysam_stderr redirect + kstring_t res_pysam_stdout = { 0, 0, NULL }; + kstring_t res_pysam_stderr = { 0, 0, NULL }; + FILE* orig_pysam_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save pysam_stderr + FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr + char* tempfname_pysam_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o"; + char* tempfname_pysam_stderr = (optind < argc)? 
argv[optind] : "test_parse_args.tmp.e"; + FILE* check_pysam_stdout = NULL; + FILE* check_pysam_stderr = NULL; // Cleanup getopt optind = 1; // setup - if (verbose) fprintf(orig_stdout,"BEGIN test 1\n"); // test eliminating a tag that isn't there + if (verbose) fprintf(orig_pysam_stdout,"BEGIN test 1\n"); // test eliminating a tag that isn't there int argc_1; char** argv_1; setup_test_1(&argc_1, &argv_1); if (verbose > 1) { - fprintf(orig_stdout, "argc: %d\n", argc_1); + fprintf(orig_pysam_stdout, "argc: %d\n", argc_1); } - if (verbose) fprintf(orig_stdout,"RUN test 1\n"); + if (verbose) fprintf(orig_pysam_stdout,"RUN test 1\n"); // test - xfreopen(tempfname_stdout, "w", stdout); // Redirect stdout to pipe - xfreopen(tempfname_pysamerr, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname_pysam_stdout, "w", pysam_stdout); // Redirect pysam_stdout to pipe + xfreopen(tempfname_pysam_stderr, "w", pysam_stderr); // Redirect pysam_stderr to pipe parsed_opts_t* result_1 = parse_args(argc_1, argv_1); - fclose(stdout); - fclose(pysamerr); + fclose(pysam_stdout); + fclose(pysam_stderr); - if (verbose) fprintf(orig_stdout, "END RUN test 1\n"); + if (verbose) fprintf(orig_pysam_stdout, "END RUN test 1\n"); if (verbose > 1) { - fprintf(orig_stdout, "argc: %d\n", argc_1); + fprintf(orig_pysam_stdout, "argc: %d\n", argc_1); } // check result - res_stdout.l = res_pysamerr.l = 0; - check_stdout = fopen(tempfname_stdout, "r"); - check_pysamerr = fopen(tempfname_pysamerr, "r"); + res_pysam_stdout.l = res_pysam_stderr.l = 0; + check_pysam_stdout = fopen(tempfname_pysam_stdout, "r"); + check_pysam_stderr = fopen(tempfname_pysam_stderr, "r"); if ( !result_1 - && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) >= 0 - && !feof(check_stdout) - && res_stdout.l > 0 - && kgetline(&res_pysamerr, (kgets_func *)fgets, check_pysamerr) < 0 - && (feof(check_pysamerr) || res_pysamerr.l == 0)) { + && kgetline(&res_pysam_stdout, (kgets_func *)fgets, check_pysam_stdout) >= 0 + && 
!feof(check_pysam_stdout) + && res_pysam_stdout.l > 0 + && kgetline(&res_pysam_stderr, (kgets_func *)fgets, check_pysam_stderr) < 0 + && (feof(check_pysam_stderr) || res_pysam_stderr.l == 0)) { ++success; } else { ++failure; - if (verbose) fprintf(orig_stdout, "FAIL test 1\n"); + if (verbose) fprintf(orig_pysam_stdout, "FAIL test 1\n"); } - fclose(check_pysamerr); - fclose(check_stdout); + fclose(check_pysam_stderr); + fclose(check_pysam_stdout); // teardown cleanup_opts(result_1); @@ -148,49 +150,49 @@ int main(int argc, char**argv) free(argv_1[i]); } free(argv_1); - if (verbose) fprintf(orig_stdout, "END test 1\n"); + if (verbose) fprintf(orig_pysam_stdout, "END test 1\n"); // Cleanup getopt optind = 1; - if (verbose) fprintf(orig_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there + if (verbose) fprintf(orig_pysam_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there int argc_2; char** argv_2; setup_test_2(&argc_2, &argv_2); if (verbose > 1) { - fprintf(orig_stdout, "argc: %d\n", argc_2); + fprintf(orig_pysam_stdout, "argc: %d\n", argc_2); } - if (verbose) fprintf(orig_stdout, "RUN test 2\n"); + if (verbose) fprintf(orig_pysam_stdout, "RUN test 2\n"); // test - xfreopen(tempfname_stdout, "w", stdout); // Redirect stdout to pipe - xfreopen(tempfname_pysamerr, "w", pysamerr); // Redirect pysamerr to pipe + xfreopen(tempfname_pysam_stdout, "w", pysam_stdout); // Redirect pysam_stdout to pipe + xfreopen(tempfname_pysam_stderr, "w", pysam_stderr); // Redirect pysam_stderr to pipe parsed_opts_t* result_2 = parse_args(argc_2, argv_2); - fclose(stdout); - fclose(pysamerr); + fclose(pysam_stdout); + fclose(pysam_stderr); - if (verbose) fprintf(orig_stdout, "END RUN test 2\n"); + if (verbose) fprintf(orig_pysam_stdout, "END RUN test 2\n"); if (verbose > 1) { - fprintf(orig_stdout, "argc: %d\n", argc_2); + fprintf(orig_pysam_stdout, "argc: %d\n", argc_2); } // check result - res_stdout.l = res_pysamerr.l = 0; - check_stdout = 
fopen(tempfname_stdout, "r"); - check_pysamerr = fopen(tempfname_pysamerr, "r"); + res_pysam_stdout.l = res_pysam_stderr.l = 0; + check_pysam_stdout = fopen(tempfname_pysam_stdout, "r"); + check_pysam_stderr = fopen(tempfname_pysam_stderr, "r"); if ( result_2 && check_test_2(result_2) - && kgetline(&res_stdout, (kgets_func *)fgets, check_stdout) < 0 - && (feof(check_stdout) || res_stdout.l == 0) - && kgetline(&res_pysamerr, (kgets_func *)fgets, check_pysamerr) < 0 - && (feof(check_pysamerr) || res_pysamerr.l == 0)) { + && kgetline(&res_pysam_stdout, (kgets_func *)fgets, check_pysam_stdout) < 0 + && (feof(check_pysam_stdout) || res_pysam_stdout.l == 0) + && kgetline(&res_pysam_stderr, (kgets_func *)fgets, check_pysam_stderr) < 0 + && (feof(check_pysam_stderr) || res_pysam_stderr.l == 0)) { ++success; } else { ++failure; - if (verbose) fprintf(orig_stdout, "FAIL test 2\n"); + if (verbose) fprintf(orig_pysam_stdout, "FAIL test 2\n"); } - fclose(check_stdout); - fclose(check_pysamerr); + fclose(check_pysam_stdout); + fclose(check_pysam_stderr); // teardown cleanup_opts(result_2); @@ -200,18 +202,18 @@ int main(int argc, char**argv) } free(argv_2); - if (verbose) fprintf(orig_stdout, "END test 2\n"); + if (verbose) fprintf(orig_pysam_stdout, "END test 2\n"); // Cleanup - free(res_stdout.s); - free(res_pysamerr.s); - remove(tempfname_stdout); - remove(tempfname_pysamerr); - fclose(orig_stdout); + free(res_pysam_stdout.s); + free(res_pysam_stderr.s); + remove(tempfname_pysam_stdout); + remove(tempfname_pysam_stderr); + fclose(orig_pysam_stdout); if (failure > 0) - fprintf(orig_pysamerr, "%d failures %d successes\n", failure, success); - fclose(orig_pysamerr); + fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_pysam_stderr); return (success == NUM_TESTS)? 
EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/test.c b/samtools/test/test.c index ef1d1f9..7ab38af 100644 --- a/samtools/test/test.c +++ b/samtools/test/test.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include diff --git a/samtools/test/test.c.pysam.c b/samtools/test/test.c.pysam.c index 735eb7b..a8295b5 100644 --- a/samtools/test/test.c.pysam.c +++ b/samtools/test/test.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -35,7 +37,7 @@ DEALINGS IN THE SOFTWARE. */ void xfreopen(const char *path, const char *mode, FILE *stream) { if (freopen(path, mode, stream) == NULL) { - fprintf(pysamerr, __FILE__": error reopening %s: %s\n", + fprintf(pysam_stderr, __FILE__": error reopening %s: %s\n", path, strerror(errno)); exit(2); } @@ -43,13 +45,13 @@ void xfreopen(const char *path, const char *mode, FILE *stream) void dump_hdr(const bam_hdr_t* hdr) { - printf("n_targets: %d\n", hdr->n_targets); - printf("ignore_sam_err: %d\n", hdr->ignore_sam_err); - printf("l_text: %u\n", hdr->l_text); - printf("idx\ttarget_len\ttarget_name:\n"); + fprintf(pysam_stdout, "n_targets: %d\n", hdr->n_targets); + fprintf(pysam_stdout, "ignore_sam_err: %d\n", hdr->ignore_sam_err); + fprintf(pysam_stdout, "l_text: %u\n", hdr->l_text); + fprintf(pysam_stdout, "idx\ttarget_len\ttarget_name:\n"); int32_t target; for (target = 0; target < hdr->n_targets; ++target) { - printf("%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); + fprintf(pysam_stdout, "%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); } - printf("text: \"%s\"\n", hdr->text); + fprintf(pysam_stdout, "text: 
\"%s\"\n", hdr->text); } diff --git a/samtools/test/tview/test_get_rg_sample.c b/samtools/test/tview/test_get_rg_sample.c index c22ba9d..3db9da2 100644 --- a/samtools/test/tview/test_get_rg_sample.c +++ b/samtools/test/tview/test_get_rg_sample.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_tview.c" #include diff --git a/samtools/test/tview/test_get_rg_sample.c.pysam.c b/samtools/test/tview/test_get_rg_sample.c.pysam.c index 99a217f..8c441f9 100644 --- a/samtools/test/tview/test_get_rg_sample.c.pysam.c +++ b/samtools/test/tview/test_get_rg_sample.c.pysam.c @@ -24,6 +24,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "../../bam_tview.c" #include @@ -59,7 +61,7 @@ void teardown_1(khash_t(kh_rg)* test_result, char* header) free(header); } -int main(int argc, char** argv) +int samtools_test_get_rg_sample_main(int argc, char** argv) { const int NUM_TESTS = 1; int success = 0; @@ -77,7 +79,7 @@ int main(int argc, char** argv) if (success == NUM_TESTS) { return 0; } else { - fprintf(pysamerr, "%d failures %d successes\n", failure, success); + fprintf(pysam_stderr, "%d failures %d successes\n", failure, success); return 1; } } diff --git a/samtools/version.h b/samtools/version.h index abe052c..ec46e67 100644 --- a/samtools/version.h +++ b/samtools/version.h @@ -1 +1 @@ -#define SAMTOOLS_VERSION "1.3" +#define SAMTOOLS_VERSION "1.3.1" diff --git a/setup.py b/setup.py index 7b59b69..080bc24 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ This module provides a low-level wrapper around the htslib C-API as using cython and a high-level API for convenient access to the data within standard genomic file formats. 
-The current version wraps htslib-1.3, samtools-1.3 and bcftools-1.3. +The current version wraps htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1. See: http://www.htslib.org @@ -155,7 +155,7 @@ if HTSLIB_MODE in ['shared', 'separate']: outf.write( "/* empty config.h created by pysam */\n") outf.write( - "/* conservative compilation options */") + "/* conservative compilation options */\n") if HTSLIB_LIBRARY_DIR: # linking against a shared, externally installed htslib version, no @@ -259,6 +259,16 @@ if HTSLIB_SOURCE == "builtin": "adding shared libcurl and libcrypto") external_htslib_libraries.extend(["curl", "crypto"]) +# create empty config.h files if they have not been created automatically +# or created by the user: +for fn in "samtools/config.h", "htslib/config.h": + if not os.path.exists(fn): + with open(fn, "w") as outf: + outf.write( + "/* empty config.h created by pysam */\n") + outf.write( + "/* conservative compilation options */\n") + parts = ["samtools", "bcftools", "htslib", @@ -271,15 +281,6 @@ parts = ["samtools", "vcf", "bcf"] -# remove existing files to recompute -# necessary to be both compatible for python 2.7 and 3.3 -if IS_PYTHON3: - for part in parts: - try: - os.unlink("pysam/c%s.c" % part) - except: - pass - # Exit if there are no pre-compiled files and no cython available fn = source_pattern % "htslib" if not os.path.exists(fn): @@ -449,7 +450,7 @@ ctabixproxies = Extension( "pysam.ctabixproxies", [source_pattern % "tabixproxies"] + os_c_files, - library_dirs=[], + library_dirs=htslib_library_dirs, include_dirs=include_os, libraries=external_htslib_libraries + internal_htslib_libraries, language="c", @@ -461,7 +462,7 @@ cvcf = Extension( "pysam.cvcf", [source_pattern % "vcf"] + os_c_files, - library_dirs=[], + library_dirs=htslib_library_dirs, include_dirs=["htslib", "."] + include_os + htslib_include_dirs, libraries=external_htslib_libraries + internal_htslib_libraries, language="c", diff --git a/tests/AlignedSegment_test.py 
b/tests/AlignedSegment_test.py index 5995faa..94b2eb3 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -3,6 +3,7 @@ import pysam import unittest import collections import copy +import array from TestUtils import checkFieldEqual @@ -319,7 +320,7 @@ class TestAlignedSegment(ReadTest): (None, 25, 'T'), (None, 26, 'T'), (5, 27, 'A'), (6, 28, 'A'), (7, 29, 'A'), (8, 30, 'A')] ) - + a.cigarstring = "5M2D2I2M" a.set_tag("MD", "4C^TT2") self.assertEqual( @@ -331,6 +332,34 @@ class TestAlignedSegment(ReadTest): (7, 27, 'A'), (8, 28, 'A')] ) + def test_get_aligned_pairs_skip_reference(self): + a = self.buildRead() + a.query_sequence = "A" * 10 + a.cigarstring = "5M1N5M" + a.set_tag("MD", "10") + + self.assertEqual( + a.get_aligned_pairs(with_seq=True), + [(0, 20, 'A'), (1, 21, 'A'), (2, 22, 'A'), + (3, 23, 'A'), (4, 24, 'A'), (None, 25, None), + (5, 26, 'A'), (6, 27, 'A'), (7, 28, 'A'), + (8, 29, 'A'), (9, 30, 'A')]) + + self.assertEqual( + a.get_aligned_pairs(with_seq=False), + [(0, 20), (1, 21), (2, 22), + (3, 23), (4, 24), (None, 25), + (5, 26), (6, 27), (7, 28), + (8, 29), (9, 30)]) + + self.assertEqual( + a.get_aligned_pairs(matches_only=True, with_seq=False), + [(0, 20), (1, 21), + (2, 22), (3, 23), + (4, 24), (5, 26), + (6, 27), (7, 28), + (8, 29), (9, 30)]) + def testNoSequence(self): '''issue 176: retrieving length without query sequence with soft-clipping. 
@@ -347,13 +376,60 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.query_alignment_length, 20) +class TestCigarStats(ReadTest): + + def testStats(self): + + a = self.buildRead() + + a.cigarstring = None + self.assertEqual( + [list(x) for x in a.get_cigar_stats()], + [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + + a.cigarstring = "10M" + self.assertEqual( + [list(x) for x in a.get_cigar_stats()], + [[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + + a.cigarstring = "10M2I2M" + self.assertEqual( + [list(x) for x in a.get_cigar_stats()], + [[12, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + + for i, x in enumerate("MIDNSHP=X"): + a.cigarstring = "2{}".format(x) + expected = [[0] * 11, [0] * 11] + expected[0][i] = 2 + expected[1][i] = 1 + self.assertEqual( + [list(x) for x in a.get_cigar_stats()], + expected) + + a.cigarstring = "10M" + a.set_tag("NM", 5) + self.assertEqual( + [list(x) for x in a.get_cigar_stats()], + [[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + + a.cigarstring = None + self.assertEqual( + [list(x) for x in a.get_cigar_stats()], + [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + + class TestAlignedPairs(unittest.TestCase): filename = os.path.join(DATADIR, "example_aligned_pairs.bam") def testReferenceBases(self): """reference bases should always be the same nucleotide """ - reference_bases = collections.defaultdict(list) + reference_bases = collections.defaultdict(list) with pysam.AlignmentFile(self.filename) as inf: for c in inf.pileup(): for r in c.pileups: @@ -389,7 +465,25 @@ class TestTags(ReadTest): self.assertEqual(False, a.has_tag("NM")) # check if deleting a non-existing tag is fine a.set_tag("NM", None) + a.set_tag("NM", None) + def testArrayTags(self): + read = self.buildRead() + supported_dtypes = "bhBHf" + unsupported_dtypes = "lLd" + + for dtype in supported_dtypes: + key = "F" + dtype + 
read.set_tag(key, array.array(dtype, range(10))) + ary = read.get_tag(key) + + for dtype in unsupported_dtypes: + key = "F" + dtype + self.assertRaises(ValueError, + read.set_tag, + key, + array.array(dtype, range(10))) + def testAddTagsType(self): a = self.buildRead() a.tags = None @@ -551,6 +645,23 @@ class TestTags(ReadTest): "A" * 5 + "C" * 3 + "A" * 5, a.get_reference_sequence()) + def testMDTagRefSkipping(self): + a = self.buildRead() + + a.cigarstring = "5M1N5M" + a.query_sequence = "A" * 10 + a.set_tag('MD', "10") + self.assertEqual( + "A" * 10, + a.get_reference_sequence()) + + a.cigarstring = "5M3N5M" + a.query_sequence = "A" * 10 + a.set_tag('MD', "10") + self.assertEqual( + "A" * 10, + a.get_reference_sequence()) + def testMDTagSoftClipping(self): a = self.buildRead() @@ -561,7 +672,7 @@ class TestTags(ReadTest): self.assertEqual( "A" * 5 + "C" + "A" * 5, a.get_reference_sequence()) - + # all together a.cigarstring = "5S5M1D5M1I5M5S" a.query_sequence = "G" * 5 + "A" * 16 + "G" * 5 @@ -579,7 +690,7 @@ class TestTags(ReadTest): self.assertEqual( "AAcAATCAAAAA", a.get_reference_sequence()) - + a.cigarstring = "5S5M2D1I5M5S" a.query_sequence = "G" * 5 + "A" * 11 + "G" * 5 a.set_tag('MD', "2C2^TC5") @@ -606,7 +717,7 @@ class TestTags(ReadTest): class TestCopy(ReadTest): - + def testCopy(self): a = self.buildRead() b = copy.copy(a) diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index c03e234..9a33722 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -23,7 +23,7 @@ from functools import partial import pysam import pysam.samtools from TestUtils import checkBinaryEqual, checkURL, \ - checkSamtoolsViewEqual, checkFieldEqual, force_str + check_samtools_view_equal, checkFieldEqual, force_str DATADIR = "pysam_data" @@ -49,6 +49,9 @@ class BasicTestBAMFromFetch(unittest.TestCase): "rb") self.reads = list(self.samfile.fetch()) + def tearDown(self): + self.samfile.close() + def testARqname(self): self.assertEqual( 
self.reads[0].query_name, @@ -261,9 +264,6 @@ class BasicTestBAMFromFetch(unittest.TestCase): self.assertEqual(self.reads[0].opt("XT"), "U") self.assertEqual(self.reads[1].opt("XT"), "R") - def tearDown(self): - self.samfile.close() - class BasicTestSAMFromFetch(BasicTestBAMFromFetch): @@ -426,42 +426,42 @@ class TestIO(unittest.TestCase): The *checkf* is used to determine if the files are equal. ''' - infile = pysam.AlignmentFile( - os.path.join(DATADIR, input_filename), - input_mode) - - if "b" in input_mode: - self.assertTrue(infile.is_bam) - self.assertFalse(infile.is_cram) - elif "c" in input_mode: - self.assertFalse(infile.is_bam) - self.assertTrue(infile.is_cram) - else: - self.assertFalse(infile.is_cram) - self.assertFalse(infile.is_bam) - - if use_template: - outfile = pysam.AlignmentFile( - output_filename, - output_mode, - reference_filename=sequence_filename, - template=infile) - else: - outfile = pysam.AlignmentFile( - output_filename, - output_mode, - reference_names=infile.references, - reference_lengths=infile.lengths, - reference_filename=sequence_filename, - add_sq_text=False) - iter = infile.fetch() + with pysam.AlignmentFile( + os.path.join(DATADIR, input_filename), + input_mode) as infile: + + if "b" in input_mode: + self.assertTrue(infile.is_bam) + self.assertFalse(infile.is_cram) + elif "c" in input_mode: + self.assertFalse(infile.is_bam) + self.assertTrue(infile.is_cram) + else: + self.assertFalse(infile.is_cram) + self.assertFalse(infile.is_bam) + + if use_template: + outfile = pysam.AlignmentFile( + output_filename, + output_mode, + reference_filename=sequence_filename, + template=infile) + else: + outfile = pysam.AlignmentFile( + output_filename, + output_mode, + reference_names=infile.references, + reference_lengths=infile.lengths, + reference_filename=sequence_filename, + add_sq_text=False) - for x in iter: - outfile.write(x) + iter = infile.fetch() - infile.close() - outfile.close() + for x in iter: + outfile.write(x) + + 
outfile.close() self.assertTrue(checkf( os.path.join(DATADIR, reference_filename), @@ -490,7 +490,7 @@ class TestIO(unittest.TestCase): "tmp_ex2.cram", "rc", "wc", sequence_filename="pysam_data/ex1.fa", - checkf=checkSamtoolsViewEqual) + checkf=check_samtools_view_equal) def testSAM2BAM(self): self.checkEcho("ex2.sam", @@ -512,7 +512,7 @@ class TestIO(unittest.TestCase): "rb", "wc", sequence_filename="pysam_data/ex1.fa", checkf=partial( - checkSamtoolsViewEqual, + check_samtools_view_equal, without_header=True)) def testCRAM2BAM(self): @@ -523,7 +523,7 @@ class TestIO(unittest.TestCase): "rc", "wb", sequence_filename="pysam_data/ex1.fa", checkf=partial( - checkSamtoolsViewEqual, + check_samtools_view_equal, without_header=True)) def testSAM2CRAM(self): @@ -533,7 +533,7 @@ class TestIO(unittest.TestCase): "r", "wc", sequence_filename="pysam_data/ex1.fa", checkf=partial( - checkSamtoolsViewEqual, + check_samtools_view_equal, without_header=True)) def testCRAM2SAM(self): @@ -543,7 +543,7 @@ class TestIO(unittest.TestCase): "rc", "wh", sequence_filename="pysam_data/ex1.fa", checkf=partial( - checkSamtoolsViewEqual, + check_samtools_view_equal, without_header=True)) # Disabled - should work, files are not binary equal, but are @@ -858,12 +858,18 @@ class TestIteratorRowBAM(unittest.TestCase): filename = os.path.join(DATADIR, "ex2.bam") mode = "rb" + reference_filename = None def setUp(self): self.samfile = pysam.AlignmentFile( - self.filename, self.mode, + self.filename, + self.mode, + reference_filename=self.reference_filename, ) + def tearDown(self): + self.samfile.close() + def checkRange(self, rnge): '''compare results from iterator with those from samtools.''' ps = list(self.samfile.fetch(region=rnge)) @@ -911,9 +917,6 @@ class TestIteratorRowBAM(unittest.TestCase): self.checkRange("%s:%i-%i" % (contig, start, start + 90)) - def tearDown(self): - self.samfile.close() - class TestIteratorRowAllBAM(unittest.TestCase): @@ -1034,9 +1037,9 @@ class 
TestIteratorRowCRAM(TestIteratorRowBAM): mode = "rc" -class TestIteratorRowCRAM(TestIteratorRowBAM): - filename = os.path.join(DATADIR, "ex2.cram") - mode = "rc" +class TestIteratorRowCRAMWithReferenceFilename(TestIteratorRowCRAM): + reference_filename = os.path.join(DATADIR, "ex1.fa") + ########################################################## ########################################################## @@ -1840,40 +1843,54 @@ class TestBTagBam(TestBTagSam): filename = os.path.join(DATADIR, 'example_btag.bam') -class TestDoubleFetch(unittest.TestCase): - +class TestDoubleFetchBAM(unittest.TestCase): '''check if two iterators on the same bamfile are independent.''' filename = os.path.join(DATADIR, 'ex1.bam') + mode = "rb" def testDoubleFetch(self): - samfile1 = pysam.AlignmentFile(self.filename, 'rb') - - for a, b in zip(samfile1.fetch(multiple_iterators=True), - samfile1.fetch(multiple_iterators=True)): - self.assertEqual(a.compare(b), 0) + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + for a, b in zip(samfile1.fetch(multiple_iterators=True), + samfile1.fetch(multiple_iterators=True)): + self.assertEqual(a.compare(b), 0) def testDoubleFetchWithRegion(self): - samfile1 = pysam.AlignmentFile(self.filename, 'rb') - chr, start, stop = 'chr1', 200, 3000000 - # just making sure the test has something to catch - self.assertTrue(len(list(samfile1.fetch(chr, start, stop))) > 0) + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + contig, start, stop = 'chr1', 200, 3000000 + # just making sure the test has something to catch + self.assertTrue(len(list(samfile1.fetch(contig, start, stop))) > 0) - for a, b in zip(samfile1.fetch(chr, start, stop), - samfile1.fetch(chr, start, stop, - multiple_iterators=True)): - self.assertEqual(a.compare(b), 0) + # see Issue #293 + # The following fails for CRAM files, but works for BAM + # files when the first is multiple_iterators=False: + for a, b in zip(samfile1.fetch(contig, start, stop, + 
multiple_iterators=True), + samfile1.fetch(contig, start, stop, + multiple_iterators=True)): + self.assertEqual(a.compare(b), 0) def testDoubleFetchUntilEOF(self): - samfile1 = pysam.AlignmentFile(self.filename, 'rb') + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + + for a, b in zip(samfile1.fetch(until_eof=True), + samfile1.fetch(until_eof=True, + multiple_iterators=True)): + self.assertEqual(a.compare(b), 0) + + +class TestDoubleFetchCRAM(TestDoubleFetchBAM): + filename = os.path.join(DATADIR, 'ex2.cram') + mode = "rc" + - for a, b in zip(samfile1.fetch(until_eof=True), - samfile1.fetch(until_eof=True, - multiple_iterators=True)): - self.assertEqual(a.compare(b), 0) +class TestDoubleFetchCRAMWithReference(TestDoubleFetchBAM): + filename = os.path.join(DATADIR, 'ex2.cram') + mode = "rc" + reference_filename = os.path.join(DATADIR, 'ex1.fa') class TestRemoteFileFTP(unittest.TestCase): @@ -1926,10 +1943,11 @@ class TestRemoteFileHTTP(unittest.TestCase): if not checkURL(self.url): return - samfile = pysam.AlignmentFile(self.url, "rb") - result = list(samfile.fetch(region=self.region)) - samfile_local = pysam.AlignmentFile(self.local, "rb") - ref = list(samfile_local.fetch(region=self.region)) + with pysam.AlignmentFile(self.url, "rb") as samfile: + result = list(samfile.fetch(region=self.region)) + + with pysam.AlignmentFile(self.local, "rb") as samfile_local: + ref = list(samfile_local.fetch(region=self.region)) self.assertEqual(len(ref), len(result)) for x, y in zip(result, ref): @@ -1939,10 +1957,11 @@ class TestRemoteFileHTTP(unittest.TestCase): if not checkURL(self.url): return - samfile = pysam.AlignmentFile(self.url, "rb") - result = list(samfile.fetch()) - samfile_local = pysam.AlignmentFile(self.local, "rb") - ref = list(samfile_local.fetch()) + with pysam.AlignmentFile(self.url, "rb") as samfile: + result = list(samfile.fetch()) + + with pysam.AlignmentFile(self.local, "rb") as samfile_local: + ref = list(samfile_local.fetch()) 
self.assertEqual(len(ref), len(result)) for x, y in zip(result, ref): @@ -2009,6 +2028,10 @@ class TestPileup(unittest.TestCase): self.samfile = pysam.AlignmentFile(self.samfilename) self.fastafile = pysam.Fastafile(self.fastafilename) + def tearDown(self): + self.samfile.close() + self.fastafile.close() + def checkEqual(self, references, iterator): for x, column in enumerate(iterator): @@ -2070,6 +2093,10 @@ class TestCountCoverage(unittest.TestCase): samfile.close() pysam.samtools.index("test_count_coverage_read_all.bam") + def tearDown(self): + self.samfile.close() + self.fastafile.close() + def count_coverage_python(self, bam, chrom, start, stop, read_callback, quality_threshold=15): @@ -2161,23 +2188,26 @@ class TestCountCoverage(unittest.TestCase): self.assertEqual(fast_counts[3], manual_counts[3]) def test_count_coverage_read_all(self): - samfile = pysam.AlignmentFile("test_count_coverage_read_all.bam") + chrom = 'chr1' start = 0 stop = 2000 def filter(read): return not (read.flag & (0x4 | 0x100 | 0x200 | 0x400)) - fast_counts = samfile.count_coverage( - chrom, start, stop, - read_callback='all', - #read_callback = lambda read: ~(read.flag & (0x4 | 0x100 | 0x200 | 0x400)), - quality_threshold=0) - manual_counts = samfile.count_coverage( - chrom, start, stop, - read_callback=lambda read: not( - read.flag & (0x4 | 0x100 | 0x200 | 0x400)), - quality_threshold=0) + + with pysam.AlignmentFile("test_count_coverage_read_all.bam") as samfile: + + fast_counts = samfile.count_coverage( + chrom, start, stop, + read_callback='all', + #read_callback = lambda read: ~(read.flag & (0x4 | 0x100 | 0x200 | 0x400)), + quality_threshold=0) + manual_counts = samfile.count_coverage( + chrom, start, stop, + read_callback=lambda read: not( + read.flag & (0x4 | 0x100 | 0x200 | 0x400)), + quality_threshold=0) os.unlink("test_count_coverage_read_all.bam") os.unlink("test_count_coverage_read_all.bam.bai") @@ -2202,18 +2232,20 @@ class TestCountCoverage(unittest.TestCase): 
samfile.write(read) samfile.close() pysam.samtools.index("test_count_coverage_nofilter.bam") - samfile = pysam.AlignmentFile("test_count_coverage_nofilter.bam") chr = 'chr1' start = 0 stop = 2000 - fast_counts = samfile.count_coverage(chr, start, stop, - read_callback='nofilter', - quality_threshold=0) - manual_counts = self.count_coverage_python(samfile, chr, start, stop, - read_callback=lambda x: True, - quality_threshold=0) - samfile.close() + with pysam.AlignmentFile("test_count_coverage_nofilter.bam") as samfile: + + fast_counts = samfile.count_coverage(chr, start, stop, + read_callback='nofilter', + quality_threshold=0) + + manual_counts = self.count_coverage_python(samfile, chr, start, stop, + read_callback=lambda x: True, + quality_threshold=0) + os.unlink("test_count_coverage_nofilter.bam") os.unlink("test_count_coverage_nofilter.bam.bai") self.assertEqual(fast_counts[0], manual_counts[0]) @@ -2223,7 +2255,7 @@ class TestCountCoverage(unittest.TestCase): class TestPileupQueryPosition(unittest.TestCase): - + filename = "test_query_position.bam" def testPileup(self): @@ -2260,8 +2292,8 @@ class TestLogging(unittest.TestCase): log_hand.setFormatter(formatter) logger.addHandler(log_hand) - bam = pysam.AlignmentFile(bamfile, 'rb') - cols = bam.pileup() + with pysam.AlignmentFile(bamfile, 'rb') as bam: + cols = bam.pileup() self.assertTrue(True) def testFail1(self): @@ -2292,40 +2324,41 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase): def testCount(self): - samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), - "rb") + with pysam.AlignmentFile( + os.path.join(DATADIR, "ex1.bam"), + "rb") as samfile: - for contig in ("chr1", "chr2"): - for start in range(0, 2000, 100): - end = start + 1 - self.assertEqual( - len(list(samfile.fetch(contig, start, end))), - samfile.count(contig, start, end), - 'number mismatch for %s:%i-%i %i != %i' % ( - contig, start, end, + for contig in ("chr1", "chr2"): + for start in range(0, 2000, 100): + end = start 
+ 1 + self.assertEqual( len(list(samfile.fetch(contig, start, end))), - samfile.count(contig, start, end))) + samfile.count(contig, start, end), + 'number mismatch for %s:%i-%i %i != %i' % ( + contig, start, end, + len(list(samfile.fetch(contig, start, end))), + samfile.count(contig, start, end))) - # test empty intervals - self.assertEqual( - len(list(samfile.fetch(contig, start, start))), - samfile.count(contig, start, start), - 'number mismatch for %s:%i-%i %i != %i' % ( - contig, start, start, + # test empty intervals + self.assertEqual( len(list(samfile.fetch(contig, start, start))), - samfile.count(contig, start, start))) + samfile.count(contig, start, start), + 'number mismatch for %s:%i-%i %i != %i' % ( + contig, start, start, + len(list(samfile.fetch(contig, start, start))), + samfile.count(contig, start, start))) - # test half empty intervals - self.assertEqual(len(list(samfile.fetch(contig, start))), - samfile.count(contig, start)) + # test half empty intervals + self.assertEqual(len(list(samfile.fetch(contig, start))), + samfile.count(contig, start)) - self.assertEqual( - len(list(samfile.fetch(contig, start))), - samfile.count(contig, start), - 'number mismatch for %s:%i %i != %i' % ( - contig, start, + self.assertEqual( len(list(samfile.fetch(contig, start))), - samfile.count(contig, start))) + samfile.count(contig, start), + 'number mismatch for %s:%i %i != %i' % ( + contig, start, + len(list(samfile.fetch(contig, start))), + samfile.count(contig, start))) def testMate(self): '''test mate access.''' @@ -2339,35 +2372,35 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase): for x in readnames: counts[x] += 1 - samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), - "rb") + with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), + "rb") as samfile: - for read in samfile.fetch(): - if not read.is_paired: - self.assertRaises(ValueError, samfile.mate, read) - elif read.mate_is_unmapped: - self.assertRaises(ValueError, samfile.mate, 
read) - else: - if counts[read.query_name] == 1: + for read in samfile.fetch(): + if not read.is_paired: + self.assertRaises(ValueError, samfile.mate, read) + elif read.mate_is_unmapped: self.assertRaises(ValueError, samfile.mate, read) else: - mate = samfile.mate(read) - self.assertEqual(read.query_name, mate.query_name) - self.assertEqual(read.is_read1, mate.is_read2) - self.assertEqual(read.is_read2, mate.is_read1) - self.assertEqual( - read.reference_start, mate.next_reference_start) - self.assertEqual( - read.next_reference_start, mate.reference_start) + if counts[read.query_name] == 1: + self.assertRaises(ValueError, samfile.mate, read) + else: + mate = samfile.mate(read) + self.assertEqual(read.query_name, mate.query_name) + self.assertEqual(read.is_read1, mate.is_read2) + self.assertEqual(read.is_read2, mate.is_read1) + self.assertEqual( + read.reference_start, mate.next_reference_start) + self.assertEqual( + read.next_reference_start, mate.reference_start) def testIndexStats(self): '''test if total number of mapped/unmapped reads is correct.''' - samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), - "rb") - self.assertEqual(samfile.mapped, 3235) - self.assertEqual(samfile.unmapped, 35) - self.assertEqual(samfile.nocoordinate, 0) + with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), + "rb") as samfile: + self.assertEqual(samfile.mapped, 3235) + self.assertEqual(samfile.unmapped, 35) + self.assertEqual(samfile.nocoordinate, 0) class TestMappedUnmapped(unittest.TestCase): @@ -2452,26 +2485,29 @@ class TestAlignmentFileIndex(unittest.TestCase): class TestExplicitIndex(unittest.TestCase): def testExplicitIndexBAM(self): - samfile = pysam.AlignmentFile( - os.path.join(DATADIR, "explicit_index.bam"), - "rb", - filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) - - samfile.fetch("chr1") + with pysam.AlignmentFile( + os.path.join(DATADIR, "explicit_index.bam"), + "rb", + filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) as samfile: + 
samfile.fetch("chr1") def testExplicitIndexCRAM(self): - samfile = pysam.AlignmentFile( - os.path.join(DATADIR, "explicit_index.cram"), - "rc", - filepath_index=os.path.join(DATADIR, 'ex1.cram.crai')) + with pysam.AlignmentFile( + os.path.join(DATADIR, "explicit_index.cram"), + "rc", + filepath_index=os.path.join(DATADIR, 'ex1.cram.crai')) as samfile: + samfile.fetch("chr1") def testRemoteExplicitIndexBAM(self): - samfile = pysam.AlignmentFile( - "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam", - "rb", - filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) + if not checkURL( + "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam"): + return - samfile.fetch("chr1") + with pysam.AlignmentFile( + "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam", + "rb", + filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) as samfile: + samfile.fetch("chr1") class TestVerbosity(unittest.TestCase): diff --git a/tests/TestUtils.py b/tests/TestUtils.py index efb2333..71ab22a 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -18,15 +18,28 @@ else: if IS_PYTHON3: def force_str(s): - return s.decode('ascii') + try: + return s.decode('ascii') + except AttributeError: + return s + def force_bytes(s): + try: + return s.encode('ascii') + except AttributeError: + return s else: def force_str(s): return s + def force_bytes(s): + return s def openfile(fn): if fn.endswith(".gz"): - return gzip.open(fn) + try: + return gzip.open(fn, "rt", encoding="utf-8") + except TypeError: + return gzip.open(fn, "r") else: return open(fn) @@ -59,8 +72,9 @@ def checkBinaryEqual(filename1, filename2): return found -def checkSamtoolsViewEqual(filename1, filename2, - without_header=False): +def check_samtools_view_equal( + filename1, filename2, + without_header=False): '''return true if the two files are equal in their content through samtools view. 
''' @@ -139,7 +153,7 @@ def checkFieldEqual(cls, read1, read2, exclude=[]): (n, getattr(read1, n), getattr(read2, n))) -def check_lines_equal(cls, a, b, sort=False, filter_f=None): +def check_lines_equal(cls, a, b, sort=False, filter_f=None, msg=None): """check if contents of two files are equal comparing line-wise. sort: bool @@ -147,17 +161,17 @@ def check_lines_equal(cls, a, b, sort=False, filter_f=None): filter_f: remover lines in both a and b where expression is True """ - aa = openfile(a).readlines() bb = openfile(b).readlines() if filter_f is not None: - aa = [x for x in aa if not filter_f] - bb = [x for x in bb if not filter_f] + aa = [x for x in aa if not filter_f(x)] + bb = [x for x in bb if not filter_f(x)] + if sort: - cls.assertEqual(sorted(aa), sorted(bb)) + cls.assertEqual(sorted(aa), sorted(bb), msg) else: - cls.assertEqual(aa, bb) + cls.assertEqual(aa, bb, msg) def get_temp_filename(suffix=""): diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index a7e54ac..ef21245 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -2,6 +2,7 @@ import os import unittest import pysam import gzip +import subprocess from TestUtils import get_temp_filename, check_lines_equal DATADIR="cbcf_data" @@ -9,7 +10,6 @@ from tabix_test import loadAndConvert def read_header(filename): - data = [] if filename.endswith(".gz"): for line in gzip.open(filename): @@ -21,6 +21,7 @@ def read_header(filename): for line in f: if line.startswith("#"): data.append(line) + return data @@ -135,8 +136,9 @@ class TestOpening(unittest.TestCase): self.assertEqual(len(list(inf.fetch())), 5) def testDetectBCF(self): - with pysam.VariantFile(os.path.join(DATADIR, - "example_vcf40.bcf")) as inf: + with pysam.VariantFile(os.path.join( + DATADIR, + "example_vcf40.bcf")) as inf: self.assertEqual(inf.category, 'VARIANTS') self.assertEqual(inf.format, 'BCF') self.assertEqual(inf.compression, 'BGZF') @@ -333,7 +335,7 @@ class 
TestConstructionVCFWithContigs(unittest.TestCase): check_lines_equal( self, fn_in, fn_out, sort=True, - filter_f=lambda x: not x.startswith("##contig")) + filter_f=lambda x: x.startswith("##contig")) os.unlink(fn_out) def testConstructionWithRecords(self): @@ -413,6 +415,43 @@ class TestConstructionVCFGZWithoutContigs(TestConstructionVCFWithContigs): filename = "example_vcf42.vcf.gz" +class TestSettingRecordValues(unittest.TestCase): + + filename = "example_vcf40.vcf" + + def testSetQual(self): + with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: + record = next(inf) + self.assertEqual(record.qual, 47) + record.qual = record.qual + self.assertEqual(record.qual, 47) + record.qual = 10 + self.assertEqual(record.qual, 10) + self.assertEqual(str(record).split("\t")[5], "10") + + def testGenotype(self): + with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: + record = next(inf) + sample = record.samples["NA00001"] + print (sample["GT"]) + self.assertEqual(sample["GT"], (0, 0)) +# Fails with TypeError +# sample["GT"] = sample["GT"] + +class TestSubsetting(unittest.TestCase): + + filename = "example_vcf42.vcf.gz" + + def testSubsetting(self): + with pysam.VariantFile(os.path.join(DATADIR, + self.filename)) as inf: + inf.subset_samples(["NA00001"]) + if __name__ == "__main__": + # build data files + print ("building data files") + subprocess.call("make -C %s" % DATADIR, shell=True) + print ("starting tests") unittest.main() + print ("completed tests") diff --git a/tests/cbcf_data/example_vcf42.vcf b/tests/cbcf_data/example_vcf42.vcf index c6c7030..f103e1f 100644 --- a/tests/cbcf_data/example_vcf42.vcf +++ b/tests/cbcf_data/example_vcf42.vcf @@ -17,8 +17,8 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -M 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +M 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:. 
17 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. -20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 -20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:. 20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/tests/faidx_test.py b/tests/faidx_test.py index f3e6cc4..a123550 100644 --- a/tests/faidx_test.py +++ b/tests/faidx_test.py @@ -2,6 +2,7 @@ import pysam import unittest import os import gzip +import shutil from TestUtils import checkURL @@ -56,6 +57,53 @@ class TestFastaFile(unittest.TestCase): self.file.close() +class TestFastaFilePathIndex(unittest.TestCase): + + filename = os.path.join(DATADIR, "ex1.fa") + + def testGarbageIndex(self): + self.assertRaises(NotImplementedError, + pysam.FastaFile, + self.filename, + filepath_index="garbage.fa.fai") + return + + self.assertRaises(ValueError, + pysam.FastaFile, + self.filename, + filepath_index="garbage.fa.fai") + + def testOpenWithoutIndex(self): + faidx = pysam.FastaFile(self.filename) + faidx.close() + + def testOpenWithStandardIndex(self): + self.assertRaises(NotImplementedError, + pysam.FastaFile, + self.filename, + filepath_index=self.filename + ".fai") + return + + faidx = pysam.FastaFile(self.filename, + filepath_index=self.filename + ".fai") + faidx.close() + + def testOpenWithOtherIndex(self): + return + tmpfilename = "tmp_" + os.path.basename(self.filename) + shutil.copyfile(self.filename, tmpfilename) + faidx = pysam.FastaFile(tmpfilename, + filepath_index=self.filename + ".fai") + faidx.close() + # index should not be auto-generated + 
self.assertFalse(os.path.exists(tmpfilename + ".fai")) + os.unlink(tmpfilename) + +class TestFastaFilePathIndexCompressed(TestFastaFilePathIndex): + + filename = os.path.join(DATADIR, "ex1.fa.gz") + + class TestFastxFileFastq(unittest.TestCase): filetype = pysam.FastxFile @@ -67,6 +115,9 @@ class TestFastxFileFastq(unittest.TestCase): persist=self.persist) self.has_quality = self.filename.endswith('.fq') + def tearDown(self): + self.file.close() + def checkFirst(self, s): # test first entry self.assertEqual(s.sequence, "GGGAACAGGGGGGTGCACTAATGCGCTCCACGCCC") @@ -160,8 +211,8 @@ class TestFastxFileWithEmptySequence(unittest.TestCase): with gzip.open(fn) as inf: ref_num = len(list(inf)) / 4 - f = self.filetype(fn) - l = len(list(f)) + with self.filetype(fn) as f: + l = len(list(f)) self.assertEqual(ref_num, l) @@ -175,10 +226,10 @@ class TestRemoteFileFTP(unittest.TestCase): def testFTPView(self): if not checkURL(self.url): return - f = pysam.Fastafile(self.url) - self.assertEqual( - len(f.fetch("chr1", 0, 1000)), - 1000) + with pysam.Fastafile(self.url) as f: + self.assertEqual( + len(f.fetch("chr1", 0, 1000)), + 1000) if __name__ == "__main__": diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index aed77b5..89a4a0c 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -17,7 +17,8 @@ all: ex1.pileup.gz \ ex2_truncated.bam \ empty.bam empty.bam.bai \ explicit_index.bam explicit_index.cram \ - faidx_empty_seq.fq.gz + faidx_empty_seq.fq.gz \ + ex1.fa.gz ex1.fa.gz.fai # ex2.sam - as ex1.sam, but with header ex2.sam.gz: ex1.bam ex1.bam.bai @@ -82,3 +83,9 @@ clean: %.fq.gz: %.fq gzip < $< > $@ + +%.fa.gz: %.fa + bgzip < $< > $@ + +%.fa.gz.fai: %.fa.gz + samtools faidx $< diff --git a/tests/samtools_test.py b/tests/samtools_test.py index e5fd8b9..d5b2791 100644 --- a/tests/samtools_test.py +++ b/tests/samtools_test.py @@ -15,7 +15,8 @@ import glob import sys import subprocess import shutil -from TestUtils import checkBinaryEqual +from 
TestUtils import checkBinaryEqual, check_lines_equal, \ + check_samtools_view_equal, get_temp_filename, force_bytes IS_PYTHON3 = sys.version_info[0] >= 3 @@ -80,6 +81,8 @@ class SamtoolsTest(unittest.TestCase): "idxstats ex1.bam > %(out)s_ex1.idxstats", "fixmate ex1.bam %(out)s_ex1.fixmate.bam", "flagstat ex1.bam > %(out)s_ex1.flagstat", + # Fails python 3.3 on linux, passes on OsX and when + # run locally "calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam", # use -s option, otherwise the following error in samtools 1.2: # Samtools-htslib-API: bam_get_library() not yet implemented @@ -132,7 +135,7 @@ class SamtoolsTest(unittest.TestCase): samtools_version)) def setUp(self): - '''setup tests. + '''setup tests. For setup, all commands will be run before the first test is executed. Individual tests will then just compare the output @@ -146,7 +149,7 @@ class SamtoolsTest(unittest.TestCase): os.makedirs(WORKDIR) for f in self.requisites: - shutil.copy(os.path.join(DATADIR, f), + shutil.copy(os.path.join(DATADIR, f), os.path.join(WORKDIR, f)) self.savedir = os.getcwd() @@ -184,13 +187,11 @@ class SamtoolsTest(unittest.TestCase): output = pysam_method(*pysam_parts, raw=True, catch_stdout=True) - # sys.stdout.write(" pysam ok\n") - if ">" in statement: with open(pysam_targets[-1], "wb") as outfile: if output is not None: - outfile = outfile.write(output) + outfile.write(force_bytes(output)) for samtools_target, pysam_target in zip(samtools_targets, pysam_targets): @@ -204,17 +205,32 @@ class SamtoolsTest(unittest.TestCase): else: samtools_files = [samtools_target] pysam_files = [pysam_target] - + for s, p in zip(samtools_files, pysam_files): - self.assertTrue( - checkBinaryEqual(s, p), - "%s failed: files %s and %s are not the same" % - (command, s, p)) + binary_equal = checkBinaryEqual(s, p) + error_msg = "%s failed: files %s and %s are not the same" % (command, s, p) + if binary_equal: + continue + if s.endswith(".bam"): + self.assertTrue( + check_samtools_view_equal( + s, p, 
without_header=True), + error_msg) + check_lines_equal( + self, s, p, + filter_f=lambda x: x.startswith("#"), + msg=error_msg) def testStatements(self): for statement in self.statements: + if (statement.startswith("calmd") and + list(sys.version_info[:2]) == [3, 3]): + # skip calmd test, fails only on python 3.3.5 + # in linux (empty output). Works in OsX and passes + # for 3.4 and 3.5, see issue #293 + continue self.check_statement(statement) - + def tearDown(self): if os.path.exists(WORKDIR): shutil.rmtree(WORKDIR) @@ -227,6 +243,28 @@ class EmptyIndexTest(unittest.TestCase): self.assertRaises(IOError, pysam.samtools.index, "exdoesntexist.bam") +class TestReturnType(unittest.TestCase): + + def testReturnValueString(self): + retval = pysam.idxstats(os.path.join(DATADIR, "ex1.bam")) + if IS_PYTHON3: + self.assertFalse(isinstance(retval, bytes)) + self.assertTrue(isinstance(retval, str)) + else: + self.assertTrue(isinstance(retval, bytes)) + self.assertTrue(isinstance(retval, basestring)) + + def testReturnValueData(self): + args = "-O BAM {}".format(os.path.join(DATADIR, "ex1.bam")).split(" ") + retval = pysam.view(*args) + + if IS_PYTHON3: + self.assertTrue(isinstance(retval, bytes)) + self.assertFalse(isinstance(retval, str)) + else: + self.assertTrue(isinstance(retval, bytes)) + self.assertTrue(isinstance(retval, basestring)) + class StdoutTest(unittest.TestCase): '''test if stdout can be redirected.''' @@ -242,11 +280,29 @@ class StdoutTest(unittest.TestCase): catch_stdout=False) self.assertEqual(r, None) + def testDoubleCalling(self): + # The following would fail if there is an + # issue with stdout being improperly caught. 
+ retvals = pysam.idxstats( + os.path.join(DATADIR, "ex1.bam")) + retvals = pysam.idxstats( + os.path.join(DATADIR, "ex1.bam")) + + def testSaveStdout(self): + outfile = get_temp_filename(suffix=".tsv") + r = pysam.samtools.flagstat( + os.path.join(DATADIR, "ex1.bam"), + save_stdout=outfile) + self.assertEqual(r, None) + with open(outfile) as inf: + r = inf.read() + self.assertTrue(len(r) > 0) + class PysamTest(SamtoolsTest): """check access to samtools command in the pysam main package. - + This is for backwards capability. """ diff --git a/tests/tabix_test.py b/tests/tabix_test.py index f09ba8c..ec1e37e 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -270,6 +270,9 @@ class TestIterationWithoutComments(IterationTest): IterationTest.setUp(self) self.tabix = pysam.TabixFile(self.filename) + def tearDown(self): + self.tabix.close() + def testRegionStrings(self): """test if access with various region strings works""" @@ -351,7 +354,7 @@ class TestIterationWithoutComments(IterationTest): self.tabix.fetch("chr1", 100, 100) def testGetContigs(self): - self.assertEqual(sorted(self.tabix.contigs), [b"chr1", b"chr2"]) + self.assertEqual(sorted(self.tabix.contigs), ["chr1", "chr2"]) # check that contigs is read-only self.assertRaises( AttributeError, setattr, self.tabix, "contigs", ["chr1", "chr2"]) @@ -374,13 +377,10 @@ class TestIterationWithoutComments(IterationTest): # opens any tabix file with pysam.TabixFile(self.filename) as inf: pass - + for i in range(1000): func1() - def tearDown(self): - self.tabix.close() - class TestIterationWithComments(TestIterationWithoutComments): @@ -405,6 +405,9 @@ class TestParser(unittest.TestCase): self.tabix = pysam.TabixFile(self.filename) self.compare = loadAndConvert(self.filename) + def tearDown(self): + self.tabix.close() + def testRead(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): @@ -505,6 +508,36 @@ class TestParser(unittest.TestCase): self.assertEqual(a, b) +class TestGTF(TestParser): + 
+ def testRead(self): + + for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())): + c = self.compare[x] + self.assertEqual(len(c), len(r)) + self.assertEqual(list(c), list(r)) + self.assertEqual(c, str(r).split("\t")) + self.assertTrue(r.gene_id.startswith("ENSG")) + if r.feature != 'gene': + self.assertTrue(r.transcript_id.startswith("ENST")) + self.assertEqual(c[0], r.contig) + self.assertEqual("\t".join(map(str, c)), + str(r)) + + def testSetting(self): + + for r in self.tabix.fetch(parser=pysam.asGTF()): + r.contig = r.contig + "_test" + r.source = r.source + "_test" + r.feature = r.feature + "_test" + r.start += 10 + r.end += 10 + r.score = 20 + r.strand = "+" + r.frame = 0 + r.attributes = 'gene_id "0001";' + + class TestIterators(unittest.TestCase): filename = os.path.join(DATADIR, "example.gtf.gz") @@ -522,6 +555,10 @@ class TestIterators(unittest.TestCase): open(self.tmpfilename_uncompressed, "wb") as outfile: outfile.write(infile.read()) + def tearDown(self): + self.tabix.close() + os.unlink(self.tmpfilename_uncompressed) + def open(self): if self.is_compressed: @@ -566,9 +603,6 @@ class TestIterators(unittest.TestCase): # Not implemented # self.assertRaises(ValueError, i.next) - def tearUp(self): - os.unlink(self.tmpfilename_uncompressed) - class TestIteratorsGenericCompressed(TestIterators): is_compressed = True @@ -584,23 +618,6 @@ class TestIteratorsFileUncompressed(TestIterators): is_compressed = False -class TestGTF(TestParser): - - def testRead(self): - - for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())): - c = self.compare[x] - self.assertEqual(len(c), len(r)) - self.assertEqual(list(c), list(r)) - self.assertEqual(c, str(r).split("\t")) - self.assertTrue(r.gene_id.startswith("ENSG")) - if r.feature != 'gene': - self.assertTrue(r.transcript_id.startswith("ENST")) - self.assertEqual(c[0], r.contig) - self.assertEqual("\t".join(map(str, c)), - str(r)) - - class TestIterationMalformattedGTFFiles(unittest.TestCase): '''test reading 
from malformatted gtf files.''' @@ -638,6 +655,9 @@ class TestBed(unittest.TestCase): self.tabix = pysam.TabixFile(self.filename) self.compare = loadAndConvert(self.filename) + def tearDown(self): + self.tabix.close() + def testRead(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asBed())): @@ -670,9 +690,6 @@ class TestBed(unittest.TestCase): self.assertEqual(int(c[2]) + 1, r.end) self.assertEqual(str(int(c[2]) + 1), r[2]) - def tearDown(self): - self.tabix.close() - class TestVCF(unittest.TestCase): @@ -736,6 +753,9 @@ class TestVCFFromTabix(TestVCF): self.tabix = pysam.TabixFile(self.tmpfilename + ".gz") self.compare = loadAndConvert(self.filename) + def tearDown(self): + self.tabix.close() + def testRead(self): ncolumns = len(self.columns) @@ -804,9 +824,6 @@ class TestVCFFromTabix(TestVCF): c[ncolumns + y] = "test_%i" % y r[y] = "test_%i" % y self.assertEqual(c[ncolumns + y], r[y]) - - def tearDown(self): - self.tabix.close() class TestVCFFromVCF(TestVCF): @@ -843,6 +860,9 @@ class TestVCFFromVCF(TestVCF): self.vcf = pysam.VCF() self.compare = loadAndConvert(self.filename, encode=False) + def tearDown(self): + self.vcf.close() + def testConnecting(self): fn = os.path.basename(self.filename) @@ -856,15 +876,25 @@ class TestVCFFromVCF(TestVCF): def get_iterator(self): - f = open(self.filename) - fn = os.path.basename(self.filename) + with open(self.filename) as f: + fn = os.path.basename(self.filename) - for x, msg in self.fail_on_opening: - if "%i.vcf" % x == fn: - self.assertRaises(ValueError, self.vcf.parse, f) - return + for x, msg in self.fail_on_opening: + if "%i.vcf" % x == fn: + self.assertRaises(ValueError, self.vcf.parse, f) + return - return self.vcf.parse(f) + for vcf_code, msg in self.fail_on_parsing: + if "%i.vcf" % vcf_code == fn: + self.assertRaises((ValueError, + AssertionError), + list, self.vcf.parse(f)) + return + # python 2.7 + # self.assertRaisesRegexp( + # ValueError, re.compile(msg), self.vcf.parse, f) + + return 
list(self.vcf.parse(f)) def get_field_value(self, record, field): return record[field] @@ -1063,6 +1093,8 @@ class TestVCFFromVariantFile(TestVCFFromVCF): missing_value = None missing_quality = None + vcf = None + def filter2value(self, r, v): if r == "PASS": return ["PASS"], list(v) @@ -1104,9 +1136,14 @@ class TestVCFFromVariantFile(TestVCFFromVCF): TestVCF.setUp(self) self.compare = loadAndConvert(self.filename, encode=False) + def tearDown(self): + if self.vcf: + self.vcf.close() + self.vcf = None + def get_iterator(self): - vcf = pysam.VariantFile(self.filename) - return vcf.fetch() + self.vcf = pysam.VariantFile(self.filename) + return self.vcf.fetch() def get_field_value(self, record, field): return getattr(record, field) @@ -1124,11 +1161,22 @@ class TestRemoteFileHTTP(unittest.TestCase): local = os.path.join(DATADIR, "example.gtf.gz") def setUp(self): + if not checkURL(self.url): + self.remote_file = None + return + self.remote_file = pysam.TabixFile(self.url, "r") self.local_file = pysam.TabixFile(self.local, "r") + def tearDown(self): + if self.remote_file is None: + return + + self.remote_file.close() + self.local_file.close() + def testFetchAll(self): - if not checkURL(self.url): + if self.remote_file is None: return remote_result = list(self.remote_file.fetch()) @@ -1139,16 +1187,15 @@ class TestRemoteFileHTTP(unittest.TestCase): self.assertEqual(x, y) def testHeader(self): + if self.remote_file is None: + return + self.assertEqual(list(self.local_file.header), []) self.assertRaises(AttributeError, getattr, self.remote_file, "header") - def tearDown(self): - self.remote_file.close() - self.local_file.close() - class TestIndexArgument(unittest.TestCase): @@ -1163,13 +1210,11 @@ class TestIndexArgument(unittest.TestCase): shutil.copyfile(self.index_src, self.index_dst) with pysam.TabixFile( - self.filename_src, "r", index=self.index_src) as \ - same_basename_file: + self.filename_src, "r", index=self.index_src) as same_basename_file: 
same_basename_results = list(same_basename_file.fetch()) with pysam.TabixFile( - self.filename_dst, "r", index=self.index_dst) as \ - diff_index_file: + self.filename_dst, "r", index=self.index_dst) as diff_index_file: diff_index_result = list(diff_index_file.fetch()) self.assertEqual(len(same_basename_results), len(diff_index_result)) @@ -1263,7 +1308,7 @@ class TestMultipleIterators(unittest.TestCase): def testDoubleFetch(self): - with pysam.TabixFile(self.filename) as f: + with pysam.TabixFile(self.filename) as f: for a, b in zip(f.fetch(multiple_iterators=True), f.fetch(multiple_iterators=True)): diff --git a/tests/test_samtools_python.py b/tests/test_samtools_python.py new file mode 100644 index 0000000..1b915fd --- /dev/null +++ b/tests/test_samtools_python.py @@ -0,0 +1,35 @@ +import pysam + +def test_idxstats_parse_split_lines(): + bam_filename = "./pysam_data/ex2.bam" + lines = pysam.idxstats(bam_filename, split_lines=True) # Test pysam 0.8.X style output, which returns a list of lines + for line in lines: + _seqname, _seqlen, nmapped, _nunmapped = line.split() + + +def test_bedcov_split_lines(): + bam_filename = "./pysam_data/ex1.bam" + bed_filename = "./pysam_data/ex1.bed" + lines = pysam.bedcov(bed_filename, bam_filename, split_lines=True) # Test pysam 0.8.X style output, which returns a list of lines + for line in lines: + fields = line.split('\t') + assert len(fields) in [4, 5], "bedcov should give tab delimited output with 4 or 5 fields. Split line (%s) gives %d fields." 
% (fields, len(fields)) + + +def test_idxstats_parse(): + bam_filename = "./pysam_data/ex2.bam" + idxstats_string = pysam.idxstats(bam_filename, split_lines=False) # Test pysam 0.9.X style output, which returns a string that needs to be split by \n + lines = idxstats_string.splitlines() + for line in lines: + splt = line.split("\t") + _seqname, _seqlen, nmapped, _nunmapped = splt + + +def test_bedcov(): + bam_filename = "./pysam_data/ex1.bam" + bed_filename = "./pysam_data/ex1.bed" + bedcov_string = pysam.bedcov(bed_filename, bam_filename, split_lines=False) # Test pysam 0.9.X style output, which returns a string that needs to be split by \n + lines = bedcov_string.splitlines() + for line in lines: + fields = line.split('\t') + assert len(fields) in [4, 5], "bedcov should give tab delimited output with 4 or 5 fields. Split line (%s) gives %d fields." % (fields, len(fields))