5953 lines
197 KiB
C
5953 lines
197 KiB
C
|
|
/* vcf.c -- VCF/BCF API functions.
|
||
|
|
|
||
|
|
Copyright (C) 2012, 2013 Broad Institute.
|
||
|
|
Copyright (C) 2012-2024 Genome Research Ltd.
|
||
|
|
Portions copyright (C) 2014 Intel Corporation.
|
||
|
|
|
||
|
|
Author: Heng Li <lh3@sanger.ac.uk>
|
||
|
|
|
||
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
|
|
of this software and associated documentation files (the "Software"), to deal
|
||
|
|
in the Software without restriction, including without limitation the rights
|
||
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
|
|
copies of the Software, and to permit persons to whom the Software is
|
||
|
|
furnished to do so, subject to the following conditions:
|
||
|
|
|
||
|
|
The above copyright notice and this permission notice shall be included in
|
||
|
|
all copies or substantial portions of the Software.
|
||
|
|
|
||
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||
|
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||
|
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||
|
|
DEALINGS IN THE SOFTWARE. */
|
||
|
|
|
||
|
|
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
|
||
|
|
#include <config.h>
|
||
|
|
|
||
|
|
#include <stdio.h>
|
||
|
|
#include <assert.h>
|
||
|
|
#include <string.h>
|
||
|
|
#include <strings.h>
|
||
|
|
#include <stdlib.h>
|
||
|
|
#include <limits.h>
|
||
|
|
#include <stdint.h>
|
||
|
|
#include <inttypes.h>
|
||
|
|
#include <errno.h>
|
||
|
|
|
||
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
||
|
|
#include "fuzz_settings.h"
|
||
|
|
#endif
|
||
|
|
|
||
|
|
#include "htslib/vcf.h"
|
||
|
|
#include "htslib/bgzf.h"
|
||
|
|
#include "htslib/tbx.h"
|
||
|
|
#include "htslib/hfile.h"
|
||
|
|
#include "hts_internal.h"
|
||
|
|
#include "htslib/hts_endian.h"
|
||
|
|
#include "htslib/khash_str2int.h"
|
||
|
|
#include "htslib/kstring.h"
|
||
|
|
#include "htslib/sam.h"
|
||
|
|
#include "htslib/khash.h"
|
||
|
|
|
||
|
|
#if 0
|
||
|
|
// This helps on Intel a bit, often 6-7% faster VCF parsing.
|
||
|
|
// Conversely sometimes harms AMD Zen4 as ~9% slower.
|
||
|
|
// Possibly related to IPC differences. However for now it's just a
|
||
|
|
// curiousity we ignore and stick with the simpler code.
|
||
|
|
//
|
||
|
|
// Left here as a hint for future explorers.
|
||
|
|
static inline int xstreq(const char *a, const char *b) {
|
||
|
|
while (*a && *a == *b)
|
||
|
|
a++, b++;
|
||
|
|
return *a == *b;
|
||
|
|
}
|
||
|
|
|
||
|
|
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
|
||
|
|
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
|
||
|
|
|
||
|
|
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
|
||
|
|
#else
|
||
|
|
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
|
||
|
|
#endif
|
||
|
|
|
||
|
|
typedef khash_t(vdict) vdict_t;
|
||
|
|
|
||
|
|
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
|
||
|
|
typedef khash_t(hdict) hdict_t;
|
||
|
|
|
||
|
|
|
||
|
|
#include "htslib/kseq.h"
|
||
|
|
HTSLIB_EXPORT
|
||
|
|
uint32_t bcf_float_missing = 0x7F800001;
|
||
|
|
|
||
|
|
HTSLIB_EXPORT
|
||
|
|
uint32_t bcf_float_vector_end = 0x7F800002;
|
||
|
|
|
||
|
|
HTSLIB_EXPORT
|
||
|
|
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||
|
|
|
||
|
|
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
|
||
|
|
|
||
|
|
/*
|
||
|
|
Partial support for 64-bit POS and Number=1 INFO tags.
|
||
|
|
Notes:
|
||
|
|
- the support for 64-bit values is motivated by POS and INFO/END for large genomes
|
||
|
|
- the use of 64-bit values does not conform to the specification
|
||
|
|
- cannot output 64-bit BCF and if it does, it is not compatible with anything
|
||
|
|
- experimental, use at your risk
|
||
|
|
*/
|
||
|
|
#ifdef VCF_ALLOW_INT64
|
||
|
|
#define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */
|
||
|
|
#define BCF_MIN_BT_INT64 -9223372036854775800LL /* INT64_MIN + 8, for internal use only */
|
||
|
|
#endif
|
||
|
|
|
||
|
|
#define BCF_IS_64BIT (1<<30)
|
||
|
|
|
||
|
|
|
||
|
|
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
|
||
|
|
// Note that this preserving API and ABI requires that the first element is vdict_t struct
|
||
|
|
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
|
||
|
|
// directly as (vdict_t*)hdr->dict.
|
||
|
|
typedef struct
|
||
|
|
{
|
||
|
|
vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
|
||
|
|
hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
|
||
|
|
size_t *key_len;// length of h->id[BCF_DT_ID] strings
|
||
|
|
}
|
||
|
|
bcf_hdr_aux_t;
|
||
|
|
|
||
|
|
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
|
||
|
|
{
|
||
|
|
return (bcf_hdr_aux_t *)hdr->dict[0];
|
||
|
|
}
|
||
|
|
|
||
|
|
static char *find_chrom_header_line(char *s)
|
||
|
|
{
|
||
|
|
char *nl;
|
||
|
|
if (strncmp(s, "#CHROM\t", 7) == 0) return s;
|
||
|
|
else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
|
||
|
|
else return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*************************
|
||
|
|
*** VCF header parser ***
|
||
|
|
*************************/
|
||
|
|
|
||
|
|
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
|
||
|
|
{
|
||
|
|
const char *ss = s;
|
||
|
|
while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
|
||
|
|
if ( !*ss || ss - s == len)
|
||
|
|
{
|
||
|
|
hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
|
||
|
|
int ret;
|
||
|
|
char *sdup = malloc(len + 1);
|
||
|
|
if (!sdup) return -1;
|
||
|
|
memcpy(sdup, s, len);
|
||
|
|
sdup[len] = 0;
|
||
|
|
|
||
|
|
// Ensure space is available in h->samples
|
||
|
|
size_t n = kh_size(d);
|
||
|
|
char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
|
||
|
|
if (!new_samples) {
|
||
|
|
free(sdup);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
h->samples = new_samples;
|
||
|
|
|
||
|
|
int k = kh_put(vdict, d, sdup, &ret);
|
||
|
|
if (ret < 0) {
|
||
|
|
free(sdup);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if (ret) { // absent
|
||
|
|
kh_val(d, k) = bcf_idinfo_def;
|
||
|
|
kh_val(d, k).id = n;
|
||
|
|
} else {
|
||
|
|
hts_log_error("Duplicated sample name '%s'", sdup);
|
||
|
|
free(sdup);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
h->samples[n] = sdup;
|
||
|
|
h->dirty = 1;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
|
||
|
|
{
|
||
|
|
if (!s) {
|
||
|
|
// Allowed for backwards-compatibility, calling with s == NULL
|
||
|
|
// used to trigger bcf_hdr_sync(h);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return bcf_hdr_add_sample_len(h, s, strlen(s));
|
||
|
|
}
|
||
|
|
|
||
|
|
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
|
||
|
|
{
|
||
|
|
const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
|
||
|
|
if ( strncmp(str,mandatory,strlen(mandatory)) )
|
||
|
|
{
|
||
|
|
hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
const char *beg = str + strlen(mandatory), *end;
|
||
|
|
if ( !*beg || *beg=='\n' ) return 0;
|
||
|
|
if ( strncmp(beg,"\tFORMAT\t",8) )
|
||
|
|
{
|
||
|
|
hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
beg += 8;
|
||
|
|
|
||
|
|
int ret = 0;
|
||
|
|
while ( *beg )
|
||
|
|
{
|
||
|
|
end = beg;
|
||
|
|
while ( *end && *end!='\t' && *end!='\n' ) end++;
|
||
|
|
if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
|
||
|
|
if ( !*end || *end=='\n' || ret<0 ) break;
|
||
|
|
beg = end + 1;
|
||
|
|
}
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_sync(bcf_hdr_t *h)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < 3; i++)
|
||
|
|
{
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[i];
|
||
|
|
khint_t k;
|
||
|
|
if ( h->n[i] < kh_size(d) )
|
||
|
|
{
|
||
|
|
bcf_idpair_t *new_idpair;
|
||
|
|
// this should be true only for i=2, BCF_DT_SAMPLE
|
||
|
|
new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
|
||
|
|
if (!new_idpair) return -1;
|
||
|
|
h->n[i] = kh_size(d);
|
||
|
|
h->id[i] = new_idpair;
|
||
|
|
}
|
||
|
|
for (k=kh_begin(d); k<kh_end(d); k++)
|
||
|
|
{
|
||
|
|
if (!kh_exist(d,k)) continue;
|
||
|
|
h->id[i][kh_val(d,k).id].key = kh_key(d,k);
|
||
|
|
h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Invalidate key length cache
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(h);
|
||
|
|
if (aux && aux->key_len) {
|
||
|
|
free(aux->key_len);
|
||
|
|
aux->key_len = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
h->dirty = 0;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_hrec_destroy(bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
if (!hrec) return;
|
||
|
|
free(hrec->key);
|
||
|
|
if ( hrec->value ) free(hrec->value);
|
||
|
|
int i;
|
||
|
|
for (i=0; i<hrec->nkeys; i++)
|
||
|
|
{
|
||
|
|
free(hrec->keys[i]);
|
||
|
|
free(hrec->vals[i]);
|
||
|
|
}
|
||
|
|
free(hrec->keys);
|
||
|
|
free(hrec->vals);
|
||
|
|
free(hrec);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Copies all fields except IDX.
|
||
|
|
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
int save_errno;
|
||
|
|
bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
|
||
|
|
if (!out) return NULL;
|
||
|
|
|
||
|
|
out->type = hrec->type;
|
||
|
|
if ( hrec->key ) {
|
||
|
|
out->key = strdup(hrec->key);
|
||
|
|
if (!out->key) goto fail;
|
||
|
|
}
|
||
|
|
if ( hrec->value ) {
|
||
|
|
out->value = strdup(hrec->value);
|
||
|
|
if (!out->value) goto fail;
|
||
|
|
}
|
||
|
|
out->nkeys = hrec->nkeys;
|
||
|
|
out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
|
||
|
|
if (!out->keys) goto fail;
|
||
|
|
out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
|
||
|
|
if (!out->vals) goto fail;
|
||
|
|
int i, j = 0;
|
||
|
|
for (i=0; i<hrec->nkeys; i++)
|
||
|
|
{
|
||
|
|
if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
|
||
|
|
if ( hrec->keys[i] ) {
|
||
|
|
out->keys[j] = strdup(hrec->keys[i]);
|
||
|
|
if (!out->keys[j]) goto fail;
|
||
|
|
}
|
||
|
|
if ( hrec->vals[i] ) {
|
||
|
|
out->vals[j] = strdup(hrec->vals[i]);
|
||
|
|
if (!out->vals[j]) goto fail;
|
||
|
|
}
|
||
|
|
j++;
|
||
|
|
}
|
||
|
|
if ( i!=j ) out->nkeys -= i-j; // IDX was omitted
|
||
|
|
return out;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
save_errno = errno;
|
||
|
|
hts_log_error("%s", strerror(errno));
|
||
|
|
bcf_hrec_destroy(out);
|
||
|
|
errno = save_errno;
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
|
||
|
|
int i;
|
||
|
|
for (i=0; i<hrec->nkeys; i++)
|
||
|
|
fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
|
||
|
|
fprintf(fp, "\n");
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_header_debug(bcf_hdr_t *hdr)
|
||
|
|
{
|
||
|
|
int i, j;
|
||
|
|
for (i=0; i<hdr->nhrec; i++)
|
||
|
|
{
|
||
|
|
if ( !hdr->hrec[i]->value )
|
||
|
|
{
|
||
|
|
fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
|
||
|
|
fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
|
||
|
|
for (j=1; j<hdr->hrec[i]->nkeys; j++)
|
||
|
|
fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
|
||
|
|
fprintf(stderr,">\n");
|
||
|
|
}
|
||
|
|
else
|
||
|
|
fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
|
||
|
|
{
|
||
|
|
char **tmp;
|
||
|
|
size_t n = hrec->nkeys + 1;
|
||
|
|
assert(len > 0 && len < SIZE_MAX);
|
||
|
|
tmp = realloc(hrec->keys, sizeof(char*)*n);
|
||
|
|
if (!tmp) return -1;
|
||
|
|
hrec->keys = tmp;
|
||
|
|
tmp = realloc(hrec->vals, sizeof(char*)*n);
|
||
|
|
if (!tmp) return -1;
|
||
|
|
hrec->vals = tmp;
|
||
|
|
|
||
|
|
hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
|
||
|
|
if (!hrec->keys[hrec->nkeys]) return -1;
|
||
|
|
memcpy(hrec->keys[hrec->nkeys],str,len);
|
||
|
|
hrec->keys[hrec->nkeys][len] = 0;
|
||
|
|
hrec->vals[hrec->nkeys] = NULL;
|
||
|
|
hrec->nkeys = n;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
|
||
|
|
{
|
||
|
|
if ( hrec->vals[i] ) {
|
||
|
|
free(hrec->vals[i]);
|
||
|
|
hrec->vals[i] = NULL;
|
||
|
|
}
|
||
|
|
if ( !str ) return 0;
|
||
|
|
if ( is_quoted )
|
||
|
|
{
|
||
|
|
if (len >= SIZE_MAX - 3) {
|
||
|
|
errno = ENOMEM;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
|
||
|
|
if (!hrec->vals[i]) return -1;
|
||
|
|
hrec->vals[i][0] = '"';
|
||
|
|
memcpy(&hrec->vals[i][1],str,len);
|
||
|
|
hrec->vals[i][len+1] = '"';
|
||
|
|
hrec->vals[i][len+2] = 0;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
if (len == SIZE_MAX) {
|
||
|
|
errno = ENOMEM;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
|
||
|
|
if (!hrec->vals[i]) return -1;
|
||
|
|
memcpy(hrec->vals[i],str,len);
|
||
|
|
hrec->vals[i][len] = 0;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
|
||
|
|
{
|
||
|
|
int n = hrec->nkeys + 1;
|
||
|
|
char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
|
||
|
|
if (!tmp) return -1;
|
||
|
|
hrec->keys = tmp;
|
||
|
|
|
||
|
|
tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
|
||
|
|
if (!tmp) return -1;
|
||
|
|
hrec->vals = tmp;
|
||
|
|
|
||
|
|
hrec->keys[hrec->nkeys] = strdup("IDX");
|
||
|
|
if (!hrec->keys[hrec->nkeys]) return -1;
|
||
|
|
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
if (kputw(idx, &str) < 0) {
|
||
|
|
free(hrec->keys[hrec->nkeys]);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
hrec->vals[hrec->nkeys] = str.s;
|
||
|
|
hrec->nkeys = n;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
for (i=0; i<hrec->nkeys; i++)
|
||
|
|
if ( !strcasecmp(key,hrec->keys[i]) ) return i;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
|
||
|
|
else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
|
||
|
|
else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
|
||
|
|
else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
|
||
|
|
else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
|
||
|
|
else hrec->type = BCF_HL_GEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/**
|
||
|
|
The arrays were generated with
|
||
|
|
|
||
|
|
valid_ctg:
|
||
|
|
perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
|
||
|
|
|
||
|
|
valid_tag:
|
||
|
|
perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
|
||
|
|
*/
|
||
|
|
static const uint8_t valid_ctg[256] =
|
||
|
|
{
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
|
||
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
|
||
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
|
||
|
|
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||
|
|
};
|
||
|
|
static const uint8_t valid_tag[256] =
|
||
|
|
{
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
|
||
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
|
||
|
|
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||
|
|
};
|
||
|
|
|
||
|
|
/**
|
||
|
|
bcf_hrec_check() - check the validity of structured header lines
|
||
|
|
|
||
|
|
Returns 0 on success or negative value on error.
|
||
|
|
|
||
|
|
Currently the return status is not checked by the caller
|
||
|
|
and only a warning is printed on stderr. This should be improved
|
||
|
|
to propagate the error all the way up to the caller and let it
|
||
|
|
decide what to do: throw an error or proceed anyway.
|
||
|
|
*/
|
||
|
|
static int bcf_hrec_check(bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
bcf_hrec_set_type(hrec);
|
||
|
|
|
||
|
|
if ( hrec->type==BCF_HL_CTG )
|
||
|
|
{
|
||
|
|
i = bcf_hrec_find_key(hrec,"ID");
|
||
|
|
if ( i<0 ) goto err_missing_id;
|
||
|
|
char *val = hrec->vals[i];
|
||
|
|
if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
|
||
|
|
while ( *(++val) )
|
||
|
|
if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
if ( hrec->type==BCF_HL_INFO )
|
||
|
|
{
|
||
|
|
i = bcf_hrec_find_key(hrec,"ID");
|
||
|
|
if ( i<0 ) goto err_missing_id;
|
||
|
|
char *val = hrec->vals[i];
|
||
|
|
if ( !strcmp(val,"1000G") ) return 0;
|
||
|
|
if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
|
||
|
|
while ( *(++val) )
|
||
|
|
if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
if ( hrec->type==BCF_HL_FMT )
|
||
|
|
{
|
||
|
|
i = bcf_hrec_find_key(hrec,"ID");
|
||
|
|
if ( i<0 ) goto err_missing_id;
|
||
|
|
char *val = hrec->vals[i];
|
||
|
|
if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
|
||
|
|
while ( *(++val) )
|
||
|
|
if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
err_missing_id:
|
||
|
|
hts_log_warning("Missing ID attribute in one or more header lines");
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
err_invalid_ctg:
|
||
|
|
hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
err_invalid_tag:
|
||
|
|
hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline int is_escaped(const char *min, const char *str)
|
||
|
|
{
|
||
|
|
int n = 0;
|
||
|
|
while ( --str>=min && *str=='\\' ) n++;
|
||
|
|
return n%2;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
|
||
|
|
{
|
||
|
|
bcf_hrec_t *hrec = NULL;
|
||
|
|
const char *p = line;
|
||
|
|
if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
|
||
|
|
p += 2;
|
||
|
|
|
||
|
|
const char *q = p;
|
||
|
|
while ( *q && *q!='=' && *q != '\n' ) q++;
|
||
|
|
ptrdiff_t n = q-p;
|
||
|
|
if ( *q!='=' || !n ) // wrong format
|
||
|
|
goto malformed_line;
|
||
|
|
|
||
|
|
hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
|
||
|
|
if (!hrec) { *len = -1; return NULL; }
|
||
|
|
hrec->key = (char*) malloc(sizeof(char)*(n+1));
|
||
|
|
if (!hrec->key) goto fail;
|
||
|
|
memcpy(hrec->key,p,n);
|
||
|
|
hrec->key[n] = 0;
|
||
|
|
hrec->type = -1;
|
||
|
|
|
||
|
|
p = ++q;
|
||
|
|
if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
|
||
|
|
{
|
||
|
|
while ( *q && *q!='\n' ) q++;
|
||
|
|
hrec->value = (char*) malloc((q-p+1)*sizeof(char));
|
||
|
|
if (!hrec->value) goto fail;
|
||
|
|
memcpy(hrec->value, p, q-p);
|
||
|
|
hrec->value[q-p] = 0;
|
||
|
|
*len = q - line + (*q ? 1 : 0); // Skip \n but not \0
|
||
|
|
return hrec;
|
||
|
|
}
|
||
|
|
|
||
|
|
// structured line, e.g.
|
||
|
|
// ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
|
||
|
|
// ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
|
||
|
|
int nopen = 1;
|
||
|
|
while ( *q && *q!='\n' && nopen>0 )
|
||
|
|
{
|
||
|
|
p = ++q;
|
||
|
|
while ( *q && *q==' ' ) { p++; q++; }
|
||
|
|
// ^[A-Za-z_][0-9A-Za-z_.]*$
|
||
|
|
if (p==q && *q && (isalpha_c(*q) || *q=='_'))
|
||
|
|
{
|
||
|
|
q++;
|
||
|
|
while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
|
||
|
|
}
|
||
|
|
n = q-p;
|
||
|
|
int m = 0;
|
||
|
|
while ( *q && *q==' ' ) { q++; m++; }
|
||
|
|
if ( *q!='=' || !n )
|
||
|
|
goto malformed_line;
|
||
|
|
|
||
|
|
if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
|
||
|
|
p = ++q;
|
||
|
|
while ( *q && *q==' ' ) { p++; q++; }
|
||
|
|
|
||
|
|
int quoted = 0;
|
||
|
|
char ending = '\0';
|
||
|
|
switch (*p) {
|
||
|
|
case '"':
|
||
|
|
quoted = 1;
|
||
|
|
ending = '"';
|
||
|
|
p++;
|
||
|
|
break;
|
||
|
|
case '[':
|
||
|
|
quoted = 1;
|
||
|
|
ending = ']';
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if ( quoted ) q++;
|
||
|
|
while ( *q && *q != '\n' )
|
||
|
|
{
|
||
|
|
if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
|
||
|
|
else
|
||
|
|
{
|
||
|
|
if ( *q=='<' ) nopen++;
|
||
|
|
if ( *q=='>' ) nopen--;
|
||
|
|
if ( !nopen ) break;
|
||
|
|
if ( *q==',' && nopen==1 ) break;
|
||
|
|
}
|
||
|
|
q++;
|
||
|
|
}
|
||
|
|
const char *r = q;
|
||
|
|
if (quoted && ending == ']') {
|
||
|
|
if (*q == ending) {
|
||
|
|
r++;
|
||
|
|
q++;
|
||
|
|
quoted = 0;
|
||
|
|
} else {
|
||
|
|
char buffer[320];
|
||
|
|
hts_log_error("Missing ']' in header line %s",
|
||
|
|
hts_strprint(buffer, sizeof(buffer), '"',
|
||
|
|
line, q-line));
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
while ( r > p && r[-1] == ' ' ) r--;
|
||
|
|
if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
|
||
|
|
goto fail;
|
||
|
|
if ( quoted && *q==ending ) q++;
|
||
|
|
if ( *q=='>' )
|
||
|
|
{
|
||
|
|
if (nopen) nopen--; // this can happen with nested angle brackets <>
|
||
|
|
q++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( nopen )
|
||
|
|
hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
|
||
|
|
|
||
|
|
// Skip to end of line
|
||
|
|
int nonspace = 0;
|
||
|
|
p = q;
|
||
|
|
while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
|
||
|
|
if (nonspace) {
|
||
|
|
char buffer[320];
|
||
|
|
hts_log_warning("Dropped trailing junk from header line '%s'",
|
||
|
|
hts_strprint(buffer, sizeof(buffer),
|
||
|
|
'"', line, q - line));
|
||
|
|
}
|
||
|
|
|
||
|
|
*len = q - line + (*q ? 1 : 0);
|
||
|
|
return hrec;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
*len = -1;
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
malformed_line:
|
||
|
|
{
|
||
|
|
char buffer[320];
|
||
|
|
while ( *q && *q!='\n' ) q++; // Ensure *len includes full line
|
||
|
|
hts_log_error("Could not parse the header line: %s",
|
||
|
|
hts_strprint(buffer, sizeof(buffer),
|
||
|
|
'"', line, q - line));
|
||
|
|
*len = q - line + (*q ? 1 : 0);
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
|
||
|
|
{
|
||
|
|
size_t new_n;
|
||
|
|
|
||
|
|
// If available, preserve existing IDX
|
||
|
|
if ( idinfo->id==-1 )
|
||
|
|
idinfo->id = hdr->n[dict_type];
|
||
|
|
else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
|
||
|
|
{
|
||
|
|
hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
|
||
|
|
idinfo->id, tag);
|
||
|
|
errno = EINVAL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
|
||
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
||
|
|
// hts_resize() can attempt to allocate up to 2 * requested items
|
||
|
|
if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
|
||
|
|
return -1;
|
||
|
|
#endif
|
||
|
|
if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
|
||
|
|
&hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
hdr->n[dict_type] = new_n;
|
||
|
|
|
||
|
|
// NB: the next kh_put call can invalidate the idinfo pointer, therefore
|
||
|
|
// we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
|
||
|
|
hdr->id[dict_type][idinfo->id].key = tag;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
|
||
|
|
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
// contig
|
||
|
|
int i, ret, replacing = 0;
|
||
|
|
khint_t k;
|
||
|
|
char *str = NULL;
|
||
|
|
|
||
|
|
bcf_hrec_set_type(hrec);
|
||
|
|
|
||
|
|
if ( hrec->type==BCF_HL_CTG )
|
||
|
|
{
|
||
|
|
hts_pos_t len = 0;
|
||
|
|
|
||
|
|
// Get the contig ID ($str) and length ($j)
|
||
|
|
i = bcf_hrec_find_key(hrec,"length");
|
||
|
|
if ( i<0 ) len = 0;
|
||
|
|
else {
|
||
|
|
char *end = hrec->vals[i];
|
||
|
|
len = strtoll(hrec->vals[i], &end, 10);
|
||
|
|
if (end == hrec->vals[i] || len < 0) return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
i = bcf_hrec_find_key(hrec,"ID");
|
||
|
|
if ( i<0 ) return 0;
|
||
|
|
str = strdup(hrec->vals[i]);
|
||
|
|
if (!str) return -1;
|
||
|
|
|
||
|
|
// Register in the dictionary
|
||
|
|
vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
|
||
|
|
khint_t k = kh_get(vdict, d, str);
|
||
|
|
if ( k != kh_end(d) ) { // already present
|
||
|
|
free(str); str=NULL;
|
||
|
|
if (kh_val(d, k).hrec[0] != NULL) // and not removed
|
||
|
|
return 0;
|
||
|
|
replacing = 1;
|
||
|
|
} else {
|
||
|
|
k = kh_put(vdict, d, str, &ret);
|
||
|
|
if (ret < 0) { free(str); return -1; }
|
||
|
|
}
|
||
|
|
|
||
|
|
int idx = bcf_hrec_find_key(hrec,"IDX");
|
||
|
|
if ( idx!=-1 )
|
||
|
|
{
|
||
|
|
char *tmp = hrec->vals[idx];
|
||
|
|
idx = strtol(hrec->vals[idx], &tmp, 10);
|
||
|
|
if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
|
||
|
|
{
|
||
|
|
if (!replacing) {
|
||
|
|
kh_del(vdict, d, k);
|
||
|
|
free(str);
|
||
|
|
}
|
||
|
|
hts_log_warning("Error parsing the IDX tag, skipping");
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
kh_val(d, k) = bcf_idinfo_def;
|
||
|
|
kh_val(d, k).id = idx;
|
||
|
|
kh_val(d, k).info[0] = len;
|
||
|
|
kh_val(d, k).hrec[0] = hrec;
|
||
|
|
if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
|
||
|
|
if (!replacing) {
|
||
|
|
kh_del(vdict, d, k);
|
||
|
|
free(str);
|
||
|
|
}
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if ( idx==-1 ) {
|
||
|
|
if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( hrec->type==BCF_HL_STR ) return 1;
|
||
|
|
if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
|
||
|
|
|
||
|
|
// INFO/FILTER/FORMAT
|
||
|
|
char *id = NULL;
|
||
|
|
uint32_t type = UINT32_MAX, var = UINT32_MAX;
|
||
|
|
int num = -1, idx = -1;
|
||
|
|
for (i=0; i<hrec->nkeys; i++)
|
||
|
|
{
|
||
|
|
if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
|
||
|
|
else if ( !strcmp(hrec->keys[i], "IDX") )
|
||
|
|
{
|
||
|
|
char *tmp = hrec->vals[i];
|
||
|
|
idx = strtol(hrec->vals[i], &tmp, 10);
|
||
|
|
if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
|
||
|
|
{
|
||
|
|
hts_log_warning("Error parsing the IDX tag, skipping");
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if ( !strcmp(hrec->keys[i], "Type") )
|
||
|
|
{
|
||
|
|
if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
|
||
|
|
else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
|
||
|
|
else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
|
||
|
|
else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
|
||
|
|
else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
|
||
|
|
else
|
||
|
|
{
|
||
|
|
hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
|
||
|
|
type = BCF_HT_STR;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if ( !strcmp(hrec->keys[i], "Number") )
|
||
|
|
{
|
||
|
|
if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
|
||
|
|
else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
|
||
|
|
else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
|
||
|
|
else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
|
||
|
|
else
|
||
|
|
{
|
||
|
|
sscanf(hrec->vals[i],"%d",&num);
|
||
|
|
var = BCF_VL_FIXED;
|
||
|
|
}
|
||
|
|
if (var != BCF_VL_FIXED) num = 0xfffff;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
|
||
|
|
if (type == -1) {
|
||
|
|
hts_log_warning("%s %s field has no Type defined. Assuming String",
|
||
|
|
*hrec->key == 'I' ? "An" : "A", hrec->key);
|
||
|
|
type = BCF_HT_STR;
|
||
|
|
}
|
||
|
|
if (var == -1) {
|
||
|
|
hts_log_warning("%s %s field has no Number defined. Assuming '.'",
|
||
|
|
*hrec->key == 'I' ? "An" : "A", hrec->key);
|
||
|
|
var = BCF_VL_VAR;
|
||
|
|
}
|
||
|
|
if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
|
||
|
|
{
|
||
|
|
hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
|
||
|
|
var = BCF_VL_FIXED;
|
||
|
|
num = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
|
||
|
|
(var & 0xf) << 8 |
|
||
|
|
(type & 0xf) << 4 |
|
||
|
|
(((uint32_t) hrec->type) & 0xf));
|
||
|
|
|
||
|
|
if ( !id ) return 0;
|
||
|
|
str = strdup(id);
|
||
|
|
if (!str) return -1;
|
||
|
|
|
||
|
|
vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
|
||
|
|
k = kh_get(vdict, d, str);
|
||
|
|
if ( k != kh_end(d) )
|
||
|
|
{
|
||
|
|
// already present
|
||
|
|
free(str);
|
||
|
|
if ( kh_val(d, k).hrec[info&0xf] ) return 0;
|
||
|
|
kh_val(d, k).info[info&0xf] = info;
|
||
|
|
kh_val(d, k).hrec[info&0xf] = hrec;
|
||
|
|
if ( idx==-1 ) {
|
||
|
|
if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
k = kh_put(vdict, d, str, &ret);
|
||
|
|
if (ret < 0) {
|
||
|
|
free(str);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
kh_val(d, k) = bcf_idinfo_def;
|
||
|
|
kh_val(d, k).info[info&0xf] = info;
|
||
|
|
kh_val(d, k).hrec[info&0xf] = hrec;
|
||
|
|
kh_val(d, k).id = idx;
|
||
|
|
if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
|
||
|
|
kh_del(vdict, d, k);
|
||
|
|
free(str);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if ( idx==-1 ) {
|
||
|
|
if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
if (hrec->type == BCF_HL_FLT ||
|
||
|
|
hrec->type == BCF_HL_INFO ||
|
||
|
|
hrec->type == BCF_HL_FMT ||
|
||
|
|
hrec->type == BCF_HL_CTG) {
|
||
|
|
int id = bcf_hrec_find_key(hrec, "ID");
|
||
|
|
if (id < 0 || !hrec->vals[id])
|
||
|
|
return;
|
||
|
|
vdict_t *dict = (hrec->type == BCF_HL_CTG
|
||
|
|
? (vdict_t*)hdr->dict[BCF_DT_CTG]
|
||
|
|
: (vdict_t*)hdr->dict[BCF_DT_ID]);
|
||
|
|
khint_t k = kh_get(vdict, dict, hrec->vals[id]);
|
||
|
|
if (k != kh_end(dict))
|
||
|
|
kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
kstring_t str = KS_INITIALIZE;
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
|
||
|
|
khint_t k;
|
||
|
|
int id;
|
||
|
|
|
||
|
|
switch (hrec->type) {
|
||
|
|
case BCF_HL_GEN:
|
||
|
|
if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
|
||
|
|
str.l = 0;
|
||
|
|
break;
|
||
|
|
case BCF_HL_STR:
|
||
|
|
id = bcf_hrec_find_key(hrec, "ID");
|
||
|
|
if (id < 0)
|
||
|
|
return;
|
||
|
|
if (!hrec->vals[id] ||
|
||
|
|
ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
|
||
|
|
str.l = 0;
|
||
|
|
break;
|
||
|
|
default:
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (str.l) {
|
||
|
|
k = kh_get(hdict, aux->gen, str.s);
|
||
|
|
} else {
|
||
|
|
// Couldn't get a string for some reason, so try the hard way...
|
||
|
|
for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
|
||
|
|
if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
|
||
|
|
kh_val(aux->gen, k) = NULL;
|
||
|
|
free((char *) kh_key(aux->gen, k));
|
||
|
|
kh_key(aux->gen, k) = NULL;
|
||
|
|
kh_del(hdict, aux->gen, k);
|
||
|
|
}
|
||
|
|
free(str.s);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
|
||
|
|
{
|
||
|
|
// currently only for bcf_hdr_set_version
|
||
|
|
assert( hrec->type==BCF_HL_GEN );
|
||
|
|
int ret;
|
||
|
|
khint_t k;
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
|
||
|
|
for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
|
||
|
|
{
|
||
|
|
if ( !kh_exist(aux->gen,k) ) continue;
|
||
|
|
if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
assert( k<kh_end(aux->gen) ); // something went wrong, should never happen
|
||
|
|
free((char*)kh_key(aux->gen,k));
|
||
|
|
kh_del(hdict,aux->gen,k);
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
|
||
|
|
{
|
||
|
|
free(str.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
k = kh_put(hdict, aux->gen, str.s, &ret);
|
||
|
|
if ( ret<0 )
|
||
|
|
{
|
||
|
|
free(str.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
free(hrec->value);
|
||
|
|
hrec->value = strdup(tmp->value);
|
||
|
|
if ( !hrec->value ) return -1;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
|
||
|
|
{
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
|
||
|
|
|
||
|
|
int res;
|
||
|
|
if ( !hrec ) return 0;
|
||
|
|
|
||
|
|
bcf_hrec_check(hrec); // todo: check return status and propagate errors up
|
||
|
|
|
||
|
|
res = bcf_hdr_register_hrec(hdr,hrec);
|
||
|
|
if (res < 0) return -1;
|
||
|
|
if ( !res )
|
||
|
|
{
|
||
|
|
// If one of the hashed field, then it is already present
|
||
|
|
if ( hrec->type != BCF_HL_GEN )
|
||
|
|
{
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Is one of the generic fields and already present?
|
||
|
|
if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
|
||
|
|
{
|
||
|
|
free(str.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
khint_t k = kh_get(hdict, aux->gen, str.s);
|
||
|
|
if ( k != kh_end(aux->gen) )
|
||
|
|
{
|
||
|
|
// duplicate record
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
free(str.s);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
int i;
|
||
|
|
if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
|
||
|
|
{
|
||
|
|
if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
|
||
|
|
{
|
||
|
|
free(str.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
khint_t k = kh_get(hdict, aux->gen, str.s);
|
||
|
|
if ( k != kh_end(aux->gen) )
|
||
|
|
{
|
||
|
|
// duplicate record
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
free(str.s);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// New record, needs to be added
|
||
|
|
int n = hdr->nhrec + 1;
|
||
|
|
bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
|
||
|
|
if (!new_hrec) {
|
||
|
|
free(str.s);
|
||
|
|
bcf_hdr_unregister_hrec(hdr, hrec);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
hdr->hrec = new_hrec;
|
||
|
|
|
||
|
|
if ( str.s )
|
||
|
|
{
|
||
|
|
khint_t k = kh_put(hdict, aux->gen, str.s, &res);
|
||
|
|
if ( res<0 )
|
||
|
|
{
|
||
|
|
free(str.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
kh_val(aux->gen,k) = hrec;
|
||
|
|
}
|
||
|
|
|
||
|
|
hdr->hrec[hdr->nhrec] = hrec;
|
||
|
|
hdr->dirty = 1;
|
||
|
|
hdr->nhrec = n;
|
||
|
|
|
||
|
|
return hrec->type==BCF_HL_GEN ? 0 : 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
if ( type==BCF_HL_GEN )
|
||
|
|
{
|
||
|
|
// e.g. ##fileformat=VCFv4.2
|
||
|
|
// ##source=GenomicsDBImport
|
||
|
|
// ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
|
||
|
|
if ( value )
|
||
|
|
{
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
ksprintf(&str, "##%s=%s", key,value);
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
|
||
|
|
khint_t k = kh_get(hdict, aux->gen, str.s);
|
||
|
|
free(str.s);
|
||
|
|
if ( k == kh_end(aux->gen) ) return NULL;
|
||
|
|
return kh_val(aux->gen, k);
|
||
|
|
}
|
||
|
|
for (i=0; i<hdr->nhrec; i++)
|
||
|
|
{
|
||
|
|
if ( hdr->hrec[i]->type!=type ) continue;
|
||
|
|
if ( strcmp(hdr->hrec[i]->key,key) ) continue;
|
||
|
|
return hdr->hrec[i];
|
||
|
|
}
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
else if ( type==BCF_HL_STR )
|
||
|
|
{
|
||
|
|
// e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
|
||
|
|
// ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
|
||
|
|
if (!str_class) return NULL;
|
||
|
|
if ( !strcmp("ID",key) )
|
||
|
|
{
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
|
||
|
|
khint_t k = kh_get(hdict, aux->gen, str.s);
|
||
|
|
free(str.s);
|
||
|
|
if ( k == kh_end(aux->gen) ) return NULL;
|
||
|
|
return kh_val(aux->gen, k);
|
||
|
|
}
|
||
|
|
for (i=0; i<hdr->nhrec; i++)
|
||
|
|
{
|
||
|
|
if ( hdr->hrec[i]->type!=type ) continue;
|
||
|
|
if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
|
||
|
|
int j = bcf_hrec_find_key(hdr->hrec[i],key);
|
||
|
|
if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
|
||
|
|
}
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
|
||
|
|
khint_t k = kh_get(vdict, d, value);
|
||
|
|
if ( k == kh_end(d) ) return NULL;
|
||
|
|
return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
|
||
|
|
{
|
||
|
|
static int PL_warned = 0, GL_warned = 0;
|
||
|
|
|
||
|
|
if ( !PL_warned )
|
||
|
|
{
|
||
|
|
int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
|
||
|
|
if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
|
||
|
|
{
|
||
|
|
hts_log_warning("PL should be declared as Number=G");
|
||
|
|
PL_warned = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( !GL_warned )
|
||
|
|
{
|
||
|
|
int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
|
||
|
|
if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
|
||
|
|
{
|
||
|
|
hts_log_warning("GL should be declared as Number=G");
|
||
|
|
GL_warned = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
|
||
|
|
{
|
||
|
|
int len, done = 0;
|
||
|
|
char *p = htxt;
|
||
|
|
|
||
|
|
// Check sanity: "fileformat" string must come as first
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
|
||
|
|
if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
|
||
|
|
hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
|
||
|
|
if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// The filter PASS must appear first in the dictionary
|
||
|
|
hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
|
||
|
|
if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Parse the whole header
|
||
|
|
do {
|
||
|
|
while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
|
||
|
|
if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
p += len;
|
||
|
|
}
|
||
|
|
assert(hrec == NULL);
|
||
|
|
if (len < 0) {
|
||
|
|
// len < 0 indicates out-of-memory, or similar error
|
||
|
|
hts_log_error("Could not parse header line: %s", strerror(errno));
|
||
|
|
return -1;
|
||
|
|
} else if (len > 0) {
|
||
|
|
// Bad header line. bcf_hdr_parse_line() will have logged it.
|
||
|
|
// Skip and try again on the next line (p + len will be the start
|
||
|
|
// of the next one).
|
||
|
|
p += len;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Next should be the sample line. If not, it was a malformed
|
||
|
|
// header, in which case print a warning and skip (many VCF
|
||
|
|
// operations do not really care about a few malformed lines).
|
||
|
|
// In the future we may want to add a strict mode that errors in
|
||
|
|
// this case.
|
||
|
|
if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
|
||
|
|
char *eol = strchr(p, '\n');
|
||
|
|
if (*p != '\0') {
|
||
|
|
char buffer[320];
|
||
|
|
hts_log_warning("Could not parse header line: %s",
|
||
|
|
hts_strprint(buffer, sizeof(buffer),
|
||
|
|
'"', p,
|
||
|
|
eol ? (eol - p) : SIZE_MAX));
|
||
|
|
}
|
||
|
|
if (eol) {
|
||
|
|
p = eol + 1; // Try from the next line.
|
||
|
|
} else {
|
||
|
|
done = -1; // No more lines left, give up.
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
done = 1; // Sample line found
|
||
|
|
}
|
||
|
|
} while (!done);
|
||
|
|
|
||
|
|
if (done < 0) {
|
||
|
|
// No sample line is fatal.
|
||
|
|
hts_log_error("Could not parse the header, sample line not found");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (bcf_hdr_parse_sample_line(hdr,p) < 0)
|
||
|
|
return -1;
|
||
|
|
if (bcf_hdr_sync(hdr) < 0)
|
||
|
|
return -1;
|
||
|
|
bcf_hdr_check_sanity(hdr);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
|
||
|
|
{
|
||
|
|
int len;
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
|
||
|
|
if ( !hrec ) return -1;
|
||
|
|
if (bcf_hdr_add_hrec(hdr, hrec) < 0)
|
||
|
|
return -1;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
|
||
|
|
{
|
||
|
|
int i = 0;
|
||
|
|
bcf_hrec_t *hrec;
|
||
|
|
if ( !key )
|
||
|
|
{
|
||
|
|
// no key, remove all entries of this type
|
||
|
|
while ( i<hdr->nhrec )
|
||
|
|
{
|
||
|
|
if ( hdr->hrec[i]->type!=type ) { i++; continue; }
|
||
|
|
hrec = hdr->hrec[i];
|
||
|
|
bcf_hdr_unregister_hrec(hdr, hrec);
|
||
|
|
bcf_hdr_remove_from_hdict(hdr, hrec);
|
||
|
|
hdr->dirty = 1;
|
||
|
|
hdr->nhrec--;
|
||
|
|
if ( i < hdr->nhrec )
|
||
|
|
memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
}
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
while (1)
|
||
|
|
{
|
||
|
|
if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
|
||
|
|
{
|
||
|
|
hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
|
||
|
|
if ( !hrec ) return;
|
||
|
|
|
||
|
|
for (i=0; i<hdr->nhrec; i++)
|
||
|
|
if ( hdr->hrec[i]==hrec ) break;
|
||
|
|
assert( i<hdr->nhrec );
|
||
|
|
|
||
|
|
vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
|
||
|
|
khint_t k = kh_get(vdict, d, key);
|
||
|
|
kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
for (i=0; i<hdr->nhrec; i++)
|
||
|
|
{
|
||
|
|
if ( hdr->hrec[i]->type!=type ) continue;
|
||
|
|
if ( type==BCF_HL_GEN )
|
||
|
|
{
|
||
|
|
if ( !strcmp(hdr->hrec[i]->key,key) ) break;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
|
||
|
|
int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
|
||
|
|
if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( i==hdr->nhrec ) return;
|
||
|
|
hrec = hdr->hrec[i];
|
||
|
|
bcf_hdr_remove_from_hdict(hdr, hrec);
|
||
|
|
}
|
||
|
|
|
||
|
|
hdr->nhrec--;
|
||
|
|
if ( i < hdr->nhrec )
|
||
|
|
memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
hdr->dirty = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
|
||
|
|
{
|
||
|
|
char tmp[256], *line = tmp;
|
||
|
|
va_list ap;
|
||
|
|
va_start(ap, fmt);
|
||
|
|
int n = vsnprintf(line, sizeof(tmp), fmt, ap);
|
||
|
|
va_end(ap);
|
||
|
|
|
||
|
|
if (n >= sizeof(tmp)) {
|
||
|
|
n++; // For trailing NUL
|
||
|
|
line = (char*)malloc(n);
|
||
|
|
if (!line)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
va_start(ap, fmt);
|
||
|
|
vsnprintf(line, n, fmt, ap);
|
||
|
|
va_end(ap);
|
||
|
|
}
|
||
|
|
|
||
|
|
int ret = bcf_hdr_append(hdr, line);
|
||
|
|
|
||
|
|
if (line != tmp) free(line);
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/**********************
|
||
|
|
*** BCF header I/O ***
|
||
|
|
**********************/
|
||
|
|
|
||
|
|
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
|
||
|
|
{
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
|
||
|
|
if ( !hrec )
|
||
|
|
{
|
||
|
|
hts_log_warning("No version string found, assuming VCFv4.2");
|
||
|
|
return "VCFv4.2";
|
||
|
|
}
|
||
|
|
return hrec->value;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
|
||
|
|
{
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
|
||
|
|
if ( !hrec )
|
||
|
|
{
|
||
|
|
int len;
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
|
||
|
|
hrec = bcf_hdr_parse_line(hdr, str.s, &len);
|
||
|
|
free(str.s);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
|
||
|
|
if ( !tmp ) return -1;
|
||
|
|
free(tmp->value);
|
||
|
|
tmp->value = strdup(version);
|
||
|
|
if ( !tmp->value ) return -1;
|
||
|
|
bcf_hdr_update_hrec(hdr, hrec, tmp);
|
||
|
|
bcf_hrec_destroy(tmp);
|
||
|
|
}
|
||
|
|
hdr->dirty = 1;
|
||
|
|
return 0; // FIXME: check for errs in this function (return < 0 if so)
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hdr_t *bcf_hdr_init(const char *mode)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
bcf_hdr_t *h;
|
||
|
|
h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
|
||
|
|
if (!h) return NULL;
|
||
|
|
for (i = 0; i < 3; ++i) {
|
||
|
|
if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
|
||
|
|
// Supersize the hash to make collisions very unlikely
|
||
|
|
static int dsize[3] = {16384,16384,2048}; // info, contig, format
|
||
|
|
if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
|
||
|
|
if ( !aux ) goto fail;
|
||
|
|
if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
|
||
|
|
aux->key_len = NULL;
|
||
|
|
aux->dict = *((vdict_t*)h->dict[0]);
|
||
|
|
free(h->dict[0]);
|
||
|
|
h->dict[0] = aux;
|
||
|
|
|
||
|
|
if ( strchr(mode,'w') )
|
||
|
|
{
|
||
|
|
bcf_hdr_append(h, "##fileformat=VCFv4.2");
|
||
|
|
// The filter PASS must appear first in the dictionary
|
||
|
|
bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
|
||
|
|
}
|
||
|
|
return h;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
for (i = 0; i < 3; ++i)
|
||
|
|
kh_destroy(vdict, h->dict[i]);
|
||
|
|
free(h);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_hdr_destroy(bcf_hdr_t *h)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
khint_t k;
|
||
|
|
if (!h) return;
|
||
|
|
for (i = 0; i < 3; ++i) {
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[i];
|
||
|
|
if (d == 0) continue;
|
||
|
|
for (k = kh_begin(d); k != kh_end(d); ++k)
|
||
|
|
if (kh_exist(d, k)) free((char*)kh_key(d, k));
|
||
|
|
if ( i==0 )
|
||
|
|
{
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(h);
|
||
|
|
for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
|
||
|
|
if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
|
||
|
|
kh_destroy(hdict, aux->gen);
|
||
|
|
free(aux->key_len); // may exist for dict[0] only
|
||
|
|
}
|
||
|
|
kh_destroy(vdict, d);
|
||
|
|
free(h->id[i]);
|
||
|
|
}
|
||
|
|
for (i=0; i<h->nhrec; i++)
|
||
|
|
bcf_hrec_destroy(h->hrec[i]);
|
||
|
|
if (h->nhrec) free(h->hrec);
|
||
|
|
if (h->samples) free(h->samples);
|
||
|
|
free(h->keep_samples);
|
||
|
|
free(h->transl[0]); free(h->transl[1]);
|
||
|
|
free(h->mem.s);
|
||
|
|
free(h);
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
|
||
|
|
{
|
||
|
|
if (hfp->format.format == vcf)
|
||
|
|
return vcf_hdr_read(hfp);
|
||
|
|
if (hfp->format.format != bcf) {
|
||
|
|
hts_log_error("Input is not detected as bcf or vcf format");
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
assert(hfp->is_bgzf);
|
||
|
|
|
||
|
|
BGZF *fp = hfp->fp.bgzf;
|
||
|
|
uint8_t magic[5];
|
||
|
|
bcf_hdr_t *h;
|
||
|
|
h = bcf_hdr_init("r");
|
||
|
|
if (!h) {
|
||
|
|
hts_log_error("Failed to allocate bcf header");
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
if (bgzf_read(fp, magic, 5) != 5)
|
||
|
|
{
|
||
|
|
hts_log_error("Failed to read the header (reading BCF in text mode?)");
|
||
|
|
bcf_hdr_destroy(h);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
|
||
|
|
{
|
||
|
|
if (!strncmp((char*)magic, "BCF", 3))
|
||
|
|
hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
|
||
|
|
else
|
||
|
|
hts_log_error("Invalid BCF2 magic string");
|
||
|
|
bcf_hdr_destroy(h);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
uint8_t buf[4];
|
||
|
|
size_t hlen;
|
||
|
|
char *htxt = NULL;
|
||
|
|
if (bgzf_read(fp, buf, 4) != 4) goto fail;
|
||
|
|
hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
|
||
|
|
if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
|
||
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
||
|
|
if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
|
||
|
|
#endif
|
||
|
|
htxt = (char*)malloc(hlen + 1);
|
||
|
|
if (!htxt) goto fail;
|
||
|
|
if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
|
||
|
|
htxt[hlen] = '\0'; // Ensure htxt is terminated
|
||
|
|
if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
|
||
|
|
free(htxt);
|
||
|
|
return h;
|
||
|
|
fail:
|
||
|
|
hts_log_error("Failed to read BCF header");
|
||
|
|
free(htxt);
|
||
|
|
bcf_hdr_destroy(h);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
|
||
|
|
{
|
||
|
|
if (!h) {
|
||
|
|
errno = EINVAL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if ( h->dirty ) {
|
||
|
|
if (bcf_hdr_sync(h) < 0) return -1;
|
||
|
|
}
|
||
|
|
hfp->format.category = variant_data;
|
||
|
|
if (hfp->format.format == vcf || hfp->format.format == text_format) {
|
||
|
|
hfp->format.format = vcf;
|
||
|
|
return vcf_hdr_write(hfp, h);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (hfp->format.format == binary_format)
|
||
|
|
hfp->format.format = bcf;
|
||
|
|
|
||
|
|
kstring_t htxt = {0,0,0};
|
||
|
|
if (bcf_hdr_format(h, 1, &htxt) < 0) {
|
||
|
|
free(htxt.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
kputc('\0', &htxt); // include the \0 byte
|
||
|
|
|
||
|
|
BGZF *fp = hfp->fp.bgzf;
|
||
|
|
if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
|
||
|
|
uint8_t hlen[4];
|
||
|
|
u32_to_le(htxt.l, hlen);
|
||
|
|
if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
|
||
|
|
if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
|
||
|
|
if ( bgzf_flush(fp) < 0) return -1;
|
||
|
|
|
||
|
|
free(htxt.s);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/********************
|
||
|
|
*** BCF site I/O ***
|
||
|
|
********************/
|
||
|
|
|
||
|
|
bcf1_t *bcf_init(void)
|
||
|
|
{
|
||
|
|
bcf1_t *v;
|
||
|
|
v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
|
||
|
|
return v;
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_clear(bcf1_t *v)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
for (i=0; i<v->d.m_info; i++)
|
||
|
|
{
|
||
|
|
if ( v->d.info[i].vptr_free )
|
||
|
|
{
|
||
|
|
free(v->d.info[i].vptr - v->d.info[i].vptr_off);
|
||
|
|
v->d.info[i].vptr_free = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
for (i=0; i<v->d.m_fmt; i++)
|
||
|
|
{
|
||
|
|
if ( v->d.fmt[i].p_free )
|
||
|
|
{
|
||
|
|
free(v->d.fmt[i].p - v->d.fmt[i].p_off);
|
||
|
|
v->d.fmt[i].p_free = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
v->rid = v->pos = v->rlen = v->unpacked = 0;
|
||
|
|
bcf_float_set_missing(v->qual);
|
||
|
|
v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
|
||
|
|
v->shared.l = v->indiv.l = 0;
|
||
|
|
v->d.var_type = -1;
|
||
|
|
v->d.shared_dirty = 0;
|
||
|
|
v->d.indiv_dirty = 0;
|
||
|
|
v->d.n_flt = 0;
|
||
|
|
v->errcode = 0;
|
||
|
|
if (v->d.m_als) v->d.als[0] = 0;
|
||
|
|
if (v->d.m_id) v->d.id[0] = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_empty(bcf1_t *v)
|
||
|
|
{
|
||
|
|
bcf_clear1(v);
|
||
|
|
free(v->d.id);
|
||
|
|
free(v->d.als);
|
||
|
|
free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
|
||
|
|
if (v->d.var ) free(v->d.var);
|
||
|
|
free(v->shared.s); free(v->indiv.s);
|
||
|
|
memset(&v->d,0,sizeof(v->d));
|
||
|
|
memset(&v->shared,0,sizeof(v->shared));
|
||
|
|
memset(&v->indiv,0,sizeof(v->indiv));
|
||
|
|
}
|
||
|
|
|
||
|
|
void bcf_destroy(bcf1_t *v)
|
||
|
|
{
|
||
|
|
if (!v) return;
|
||
|
|
bcf_empty1(v);
|
||
|
|
free(v);
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
|
||
|
|
{
|
||
|
|
uint8_t x[32];
|
||
|
|
ssize_t ret;
|
||
|
|
uint32_t shared_len, indiv_len;
|
||
|
|
if ((ret = bgzf_read(fp, x, 32)) != 32) {
|
||
|
|
if (ret == 0) return -1;
|
||
|
|
return -2;
|
||
|
|
}
|
||
|
|
bcf_clear1(v);
|
||
|
|
shared_len = le_to_u32(x);
|
||
|
|
if (shared_len < 24) return -2;
|
||
|
|
shared_len -= 24; // to exclude six 32-bit integers
|
||
|
|
indiv_len = le_to_u32(x + 4);
|
||
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
||
|
|
// ks_resize() normally allocates 1.5 * requested size to allow for growth
|
||
|
|
if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
|
||
|
|
#endif
|
||
|
|
if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
|
||
|
|
if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
|
||
|
|
v->rid = le_to_i32(x + 8);
|
||
|
|
v->pos = le_to_u32(x + 12);
|
||
|
|
if ( v->pos==UINT32_MAX ) v->pos = -1; // this is for telomere coordinate, e.g. MT:0
|
||
|
|
v->rlen = le_to_i32(x + 16);
|
||
|
|
v->qual = le_to_float(x + 20);
|
||
|
|
v->n_info = le_to_u16(x + 24);
|
||
|
|
v->n_allele = le_to_u16(x + 26);
|
||
|
|
v->n_sample = le_to_u32(x + 28) & 0xffffff;
|
||
|
|
v->n_fmt = x[31];
|
||
|
|
v->shared.l = shared_len;
|
||
|
|
v->indiv.l = indiv_len;
|
||
|
|
// silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
|
||
|
|
if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
|
||
|
|
|
||
|
|
if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
|
||
|
|
if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
#define bit_array_size(n) ((n)/8+1)
|
||
|
|
#define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
|
||
|
|
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
|
||
|
|
#define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
|
||
|
|
|
||
|
|
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
|
||
|
|
int32_t *val) {
|
||
|
|
uint32_t t;
|
||
|
|
if (end - p < 2) return -1;
|
||
|
|
t = *p++ & 0xf;
|
||
|
|
/* Use if .. else if ... else instead of switch to force order. Assumption
|
||
|
|
is that small integers are more frequent than big ones. */
|
||
|
|
if (t == BCF_BT_INT8) {
|
||
|
|
*val = *(int8_t *) p++;
|
||
|
|
} else {
|
||
|
|
if (end - p < (1<<bcf_type_shift[t])) return -1;
|
||
|
|
if (t == BCF_BT_INT16) {
|
||
|
|
*val = le_to_i16(p);
|
||
|
|
p += 2;
|
||
|
|
} else if (t == BCF_BT_INT32) {
|
||
|
|
*val = le_to_i32(p);
|
||
|
|
p += 4;
|
||
|
|
#ifdef VCF_ALLOW_INT64
|
||
|
|
} else if (t == BCF_BT_INT64) {
|
||
|
|
// This case should never happen because there should be no
|
||
|
|
// 64-bit BCFs at all, definitely not coming from htslib
|
||
|
|
*val = le_to_i64(p);
|
||
|
|
p += 8;
|
||
|
|
#endif
|
||
|
|
} else {
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*q = p;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
|
||
|
|
int *num, int *type) {
|
||
|
|
int r;
|
||
|
|
if (p >= end) return -1;
|
||
|
|
*type = *p & 0xf;
|
||
|
|
if (*p>>4 != 15) {
|
||
|
|
*q = p + 1;
|
||
|
|
*num = *p >> 4;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
|
||
|
|
if (r) return r;
|
||
|
|
return *num >= 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static const char *get_type_name(int type) {
|
||
|
|
const char *types[9] = {
|
||
|
|
"null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
|
||
|
|
"unknown", "float", "unknown", "char", "unknown"
|
||
|
|
};
|
||
|
|
int t = (type >= 0 && type < 8) ? type : 8;
|
||
|
|
return types[t];
|
||
|
|
}
|
||
|
|
|
||
|
|
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
|
||
|
|
char *type, uint32_t *reports, int i) {
|
||
|
|
if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
|
||
|
|
": Invalid FORMAT %s %d",
|
||
|
|
bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
|
||
|
|
(*reports)++;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
|
||
|
|
uint8_t *ptr, *end;
|
||
|
|
size_t bytes;
|
||
|
|
uint32_t err = 0;
|
||
|
|
int type = 0;
|
||
|
|
int num = 0;
|
||
|
|
int reflen = 0;
|
||
|
|
uint32_t i, reports;
|
||
|
|
const uint32_t is_integer = ((1 << BCF_BT_INT8) |
|
||
|
|
(1 << BCF_BT_INT16) |
|
||
|
|
#ifdef VCF_ALLOW_INT64
|
||
|
|
(1 << BCF_BT_INT64) |
|
||
|
|
#endif
|
||
|
|
(1 << BCF_BT_INT32));
|
||
|
|
const uint32_t is_valid_type = (is_integer |
|
||
|
|
(1 << BCF_BT_NULL) |
|
||
|
|
(1 << BCF_BT_FLOAT) |
|
||
|
|
(1 << BCF_BT_CHAR));
|
||
|
|
int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
|
||
|
|
|
||
|
|
// Check for valid contig ID
|
||
|
|
if (rec->rid < 0
|
||
|
|
|| (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
|
||
|
|
|| hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
|
||
|
|
hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
|
||
|
|
err |= BCF_ERR_CTG_INVALID;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Check ID
|
||
|
|
ptr = (uint8_t *) rec->shared.s;
|
||
|
|
end = ptr + rec->shared.l;
|
||
|
|
if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
|
||
|
|
if (type != BCF_BT_CHAR) {
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
|
||
|
|
err |= BCF_ERR_TAG_INVALID;
|
||
|
|
}
|
||
|
|
bytes = (size_t) num << bcf_type_shift[type];
|
||
|
|
if (end - ptr < bytes) goto bad_shared;
|
||
|
|
ptr += bytes;
|
||
|
|
|
||
|
|
// Check REF and ALT
|
||
|
|
if (rec->n_allele < 1) {
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
|
||
|
|
bcf_seqname_safe(hdr,rec), rec->pos+1);
|
||
|
|
err |= BCF_ERR_TAG_UNDEF;
|
||
|
|
}
|
||
|
|
|
||
|
|
reports = 0;
|
||
|
|
for (i = 0; i < rec->n_allele; i++) {
|
||
|
|
if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
|
||
|
|
if (type != BCF_BT_CHAR) {
|
||
|
|
if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
|
||
|
|
err |= BCF_ERR_CHAR;
|
||
|
|
}
|
||
|
|
if (i == 0) reflen = num;
|
||
|
|
bytes = (size_t) num << bcf_type_shift[type];
|
||
|
|
if (end - ptr < bytes) goto bad_shared;
|
||
|
|
ptr += bytes;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Check FILTER
|
||
|
|
reports = 0;
|
||
|
|
if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
|
||
|
|
if (num > 0) {
|
||
|
|
bytes = (size_t) num << bcf_type_shift[type];
|
||
|
|
if (((1 << type) & is_integer) == 0) {
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
|
||
|
|
err |= BCF_ERR_TAG_INVALID;
|
||
|
|
if (end - ptr < bytes) goto bad_shared;
|
||
|
|
ptr += bytes;
|
||
|
|
} else {
|
||
|
|
if (end - ptr < bytes) goto bad_shared;
|
||
|
|
for (i = 0; i < num; i++) {
|
||
|
|
int32_t key = bcf_dec_int1(ptr, type, &ptr);
|
||
|
|
if (key < 0
|
||
|
|
|| (hdr && (key >= max_id
|
||
|
|
|| hdr->id[BCF_DT_ID][key].key == NULL))) {
|
||
|
|
if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
|
||
|
|
err |= BCF_ERR_TAG_UNDEF;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Check INFO
|
||
|
|
reports = 0;
|
||
|
|
bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
|
||
|
|
for (i = 0; i < rec->n_info; i++) {
|
||
|
|
int32_t key = -1;
|
||
|
|
if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
|
||
|
|
if (key < 0 || (hdr && (key >= max_id
|
||
|
|
|| id_tmp[key].key == NULL))) {
|
||
|
|
if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
|
||
|
|
err |= BCF_ERR_TAG_UNDEF;
|
||
|
|
}
|
||
|
|
if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
|
||
|
|
if (((1 << type) & is_valid_type) == 0
|
||
|
|
|| (type == BCF_BT_NULL && num > 0)) {
|
||
|
|
if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
|
||
|
|
hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
|
||
|
|
err |= BCF_ERR_TAG_INVALID;
|
||
|
|
}
|
||
|
|
bytes = (size_t) num << bcf_type_shift[type];
|
||
|
|
if (end - ptr < bytes) goto bad_shared;
|
||
|
|
ptr += bytes;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Check FORMAT and individual information
|
||
|
|
ptr = (uint8_t *) rec->indiv.s;
|
||
|
|
end = ptr + rec->indiv.l;
|
||
|
|
reports = 0;
|
||
|
|
for (i = 0; i < rec->n_fmt; i++) {
|
||
|
|
int32_t key = -1;
|
||
|
|
if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
|
||
|
|
if (key < 0
|
||
|
|
|| (hdr && (key >= max_id
|
||
|
|
|| id_tmp[key].key == NULL))) {
|
||
|
|
bcf_record_check_err(hdr, rec, "id", &reports, key);
|
||
|
|
err |= BCF_ERR_TAG_UNDEF;
|
||
|
|
}
|
||
|
|
if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
|
||
|
|
if (((1 << type) & is_valid_type) == 0
|
||
|
|
|| (type == BCF_BT_NULL && num > 0)) {
|
||
|
|
bcf_record_check_err(hdr, rec, "type", &reports, type);
|
||
|
|
err |= BCF_ERR_TAG_INVALID;
|
||
|
|
}
|
||
|
|
bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
|
||
|
|
if (end - ptr < bytes) goto bad_indiv;
|
||
|
|
ptr += bytes;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!err && rec->rlen < 0) {
|
||
|
|
// Treat bad rlen as a warning instead of an error, and try to
|
||
|
|
// fix up by using the length of the stored REF allele.
|
||
|
|
static int warned = 0;
|
||
|
|
if (!warned) {
|
||
|
|
hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
|
||
|
|
"Only one invalid RLEN will be reported.",
|
||
|
|
bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
|
||
|
|
warned = 1;
|
||
|
|
}
|
||
|
|
rec->rlen = reflen >= 0 ? reflen : 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
rec->errcode |= err;
|
||
|
|
|
||
|
|
return err ? -2 : 0; // Return -2 so bcf_read() reports an error
|
||
|
|
|
||
|
|
bad_shared:
|
||
|
|
hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
|
||
|
|
return -2;
|
||
|
|
|
||
|
|
bad_indiv:
|
||
|
|
hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
|
||
|
|
return -2;
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
|
||
|
|
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
|
||
|
|
{
|
||
|
|
if ( !hdr->keep_samples ) return 0;
|
||
|
|
if ( !bcf_hdr_nsamples(hdr) )
|
||
|
|
{
|
||
|
|
rec->indiv.l = rec->n_sample = 0;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int i, j;
|
||
|
|
uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
|
||
|
|
bcf_dec_t *dec = &rec->d;
|
||
|
|
hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
|
||
|
|
for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
|
||
|
|
|
||
|
|
for (i=0; i<rec->n_fmt; i++)
|
||
|
|
{
|
||
|
|
ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
|
||
|
|
src = dec->fmt[i].p - dec->fmt[i].size;
|
||
|
|
if ( dst )
|
||
|
|
{
|
||
|
|
memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
|
||
|
|
dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
|
||
|
|
}
|
||
|
|
dst = dec->fmt[i].p;
|
||
|
|
for (j=0; j<hdr->nsamples_ori; j++)
|
||
|
|
{
|
||
|
|
src += dec->fmt[i].size;
|
||
|
|
if ( !bit_array_test(hdr->keep_samples,j) ) continue;
|
||
|
|
memmove(dst, src, dec->fmt[i].size);
|
||
|
|
dst += dec->fmt[i].size;
|
||
|
|
}
|
||
|
|
rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
|
||
|
|
dec->fmt[i].p_len = dst - dec->fmt[i].p;
|
||
|
|
}
|
||
|
|
rec->unpacked |= BCF_UN_FMT;
|
||
|
|
|
||
|
|
rec->n_sample = bcf_hdr_nsamples(hdr);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
|
||
|
|
{
|
||
|
|
if (fp->format.format == vcf) return vcf_read(fp,h,v);
|
||
|
|
int ret = bcf_read1_core(fp->fp.bgzf, v);
|
||
|
|
if (ret == 0) ret = bcf_record_check(h, v);
|
||
|
|
if ( ret!=0 || !h->keep_samples ) return ret;
|
||
|
|
return bcf_subset_format(h,v);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
|
||
|
|
{
|
||
|
|
bcf1_t *v = (bcf1_t *) vv;
|
||
|
|
int ret = bcf_read1_core(fp, v);
|
||
|
|
if (ret == 0) ret = bcf_record_check(NULL, v);
|
||
|
|
if (ret >= 0)
|
||
|
|
*tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
|
||
|
|
{
|
||
|
|
// single typed string
|
||
|
|
if ( line->d.id && strcmp(line->d.id, ".") ) {
|
||
|
|
return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
|
||
|
|
} else {
|
||
|
|
return bcf_enc_size(str, 0, BCF_BT_CHAR);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
|
||
|
|
{
|
||
|
|
// list of typed strings
|
||
|
|
int i;
|
||
|
|
for (i=0; i<line->n_allele; i++) {
|
||
|
|
if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
|
||
|
|
{
|
||
|
|
// typed vector of integers
|
||
|
|
if ( line->d.n_flt ) {
|
||
|
|
return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
|
||
|
|
} else {
|
||
|
|
return bcf_enc_vint(str, 0, 0, -1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
|
||
|
|
{
|
||
|
|
// pairs of typed vectors
|
||
|
|
int i, irm = -1, e = 0;
|
||
|
|
for (i=0; i<line->n_info; i++)
|
||
|
|
{
|
||
|
|
bcf_info_t *info = &line->d.info[i];
|
||
|
|
if ( !info->vptr )
|
||
|
|
{
|
||
|
|
// marked for removal
|
||
|
|
if ( irm < 0 ) irm = i;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
|
||
|
|
if ( irm >=0 )
|
||
|
|
{
|
||
|
|
bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
|
||
|
|
while ( irm<=i && line->d.info[irm].vptr ) irm++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( irm>=0 ) line->n_info = irm;
|
||
|
|
return e == 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int bcf1_sync(bcf1_t *line)
|
||
|
|
{
|
||
|
|
char *shared_ori = line->shared.s;
|
||
|
|
size_t prev_len;
|
||
|
|
|
||
|
|
kstring_t tmp = {0,0,0};
|
||
|
|
if ( !line->shared.l )
|
||
|
|
{
|
||
|
|
// New line created via API, BCF data blocks do not exist. Get it ready for BCF output
|
||
|
|
tmp = line->shared;
|
||
|
|
bcf1_sync_id(line, &tmp);
|
||
|
|
line->unpack_size[0] = tmp.l; prev_len = tmp.l;
|
||
|
|
|
||
|
|
bcf1_sync_alleles(line, &tmp);
|
||
|
|
line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
|
||
|
|
|
||
|
|
bcf1_sync_filter(line, &tmp);
|
||
|
|
line->unpack_size[2] = tmp.l - prev_len;
|
||
|
|
|
||
|
|
bcf1_sync_info(line, &tmp);
|
||
|
|
line->shared = tmp;
|
||
|
|
}
|
||
|
|
else if ( line->d.shared_dirty )
|
||
|
|
{
|
||
|
|
// The line was edited, update the BCF data block.
|
||
|
|
|
||
|
|
if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
|
||
|
|
|
||
|
|
// ptr_ori points to the original unchanged BCF data.
|
||
|
|
uint8_t *ptr_ori = (uint8_t *) line->shared.s;
|
||
|
|
|
||
|
|
// ID: single typed string
|
||
|
|
if ( line->d.shared_dirty & BCF1_DIRTY_ID )
|
||
|
|
bcf1_sync_id(line, &tmp);
|
||
|
|
else
|
||
|
|
kputsn_(ptr_ori, line->unpack_size[0], &tmp);
|
||
|
|
ptr_ori += line->unpack_size[0];
|
||
|
|
line->unpack_size[0] = tmp.l; prev_len = tmp.l;
|
||
|
|
|
||
|
|
// REF+ALT: list of typed strings
|
||
|
|
if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
|
||
|
|
bcf1_sync_alleles(line, &tmp);
|
||
|
|
else
|
||
|
|
{
|
||
|
|
kputsn_(ptr_ori, line->unpack_size[1], &tmp);
|
||
|
|
if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
|
||
|
|
}
|
||
|
|
ptr_ori += line->unpack_size[1];
|
||
|
|
line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
|
||
|
|
|
||
|
|
if ( line->unpacked & BCF_UN_FLT )
|
||
|
|
{
|
||
|
|
// FILTER: typed vector of integers
|
||
|
|
if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
|
||
|
|
bcf1_sync_filter(line, &tmp);
|
||
|
|
else if ( line->d.n_flt )
|
||
|
|
kputsn_(ptr_ori, line->unpack_size[2], &tmp);
|
||
|
|
else
|
||
|
|
bcf_enc_vint(&tmp, 0, 0, -1);
|
||
|
|
ptr_ori += line->unpack_size[2];
|
||
|
|
line->unpack_size[2] = tmp.l - prev_len;
|
||
|
|
|
||
|
|
if ( line->unpacked & BCF_UN_INFO )
|
||
|
|
{
|
||
|
|
// INFO: pairs of typed vectors
|
||
|
|
if ( line->d.shared_dirty & BCF1_DIRTY_INF )
|
||
|
|
{
|
||
|
|
bcf1_sync_info(line, &tmp);
|
||
|
|
ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
|
||
|
|
if ( size ) kputsn_(ptr_ori, size, &tmp);
|
||
|
|
|
||
|
|
free(line->shared.s);
|
||
|
|
line->shared = tmp;
|
||
|
|
}
|
||
|
|
if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
|
||
|
|
{
|
||
|
|
// Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
|
||
|
|
size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
|
||
|
|
int i;
|
||
|
|
for (i=0; i<line->n_info; i++)
|
||
|
|
{
|
||
|
|
uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
|
||
|
|
line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
|
||
|
|
off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
|
||
|
|
if ( vptr_free )
|
||
|
|
{
|
||
|
|
free(vptr_free);
|
||
|
|
line->d.info[i].vptr_free = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
|
||
|
|
{
|
||
|
|
// The genotype fields changed or are not present
|
||
|
|
tmp.l = tmp.m = 0; tmp.s = NULL;
|
||
|
|
int i, irm = -1;
|
||
|
|
for (i=0; i<line->n_fmt; i++)
|
||
|
|
{
|
||
|
|
bcf_fmt_t *fmt = &line->d.fmt[i];
|
||
|
|
if ( !fmt->p )
|
||
|
|
{
|
||
|
|
// marked for removal
|
||
|
|
if ( irm < 0 ) irm = i;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
|
||
|
|
if ( irm >=0 )
|
||
|
|
{
|
||
|
|
bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
|
||
|
|
while ( irm<=i && line->d.fmt[irm].p ) irm++;
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
if ( irm>=0 ) line->n_fmt = irm;
|
||
|
|
free(line->indiv.s);
|
||
|
|
line->indiv = tmp;
|
||
|
|
|
||
|
|
// Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
|
||
|
|
size_t off_new = 0;
|
||
|
|
for (i=0; i<line->n_fmt; i++)
|
||
|
|
{
|
||
|
|
uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
|
||
|
|
line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
|
||
|
|
off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
|
||
|
|
if ( p_free )
|
||
|
|
{
|
||
|
|
free(p_free);
|
||
|
|
line->d.fmt[i].p_free = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( !line->n_sample ) line->n_fmt = 0;
|
||
|
|
line->d.shared_dirty = line->d.indiv_dirty = 0;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
|
||
|
|
{
|
||
|
|
bcf1_sync(src);
|
||
|
|
|
||
|
|
bcf_clear(dst);
|
||
|
|
dst->rid = src->rid;
|
||
|
|
dst->pos = src->pos;
|
||
|
|
dst->rlen = src->rlen;
|
||
|
|
dst->qual = src->qual;
|
||
|
|
dst->n_info = src->n_info; dst->n_allele = src->n_allele;
|
||
|
|
dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
|
||
|
|
|
||
|
|
if ( dst->shared.m < src->shared.l )
|
||
|
|
{
|
||
|
|
dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
|
||
|
|
dst->shared.m = src->shared.l;
|
||
|
|
}
|
||
|
|
dst->shared.l = src->shared.l;
|
||
|
|
memcpy(dst->shared.s,src->shared.s,dst->shared.l);
|
||
|
|
|
||
|
|
if ( dst->indiv.m < src->indiv.l )
|
||
|
|
{
|
||
|
|
dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
|
||
|
|
dst->indiv.m = src->indiv.l;
|
||
|
|
}
|
||
|
|
dst->indiv.l = src->indiv.l;
|
||
|
|
memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
|
||
|
|
|
||
|
|
return dst;
|
||
|
|
}
|
||
|
|
bcf1_t *bcf_dup(bcf1_t *src)
|
||
|
|
{
|
||
|
|
bcf1_t *out = bcf_init1();
|
||
|
|
return bcf_copy(out, src);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
|
||
|
|
{
|
||
|
|
if ( h->dirty ) {
|
||
|
|
if (bcf_hdr_sync(h) < 0) return -1;
|
||
|
|
}
|
||
|
|
if ( bcf_hdr_nsamples(h)!=v->n_sample )
|
||
|
|
{
|
||
|
|
hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
|
||
|
|
bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( hfp->format.format == vcf || hfp->format.format == text_format )
|
||
|
|
return vcf_write(hfp,h,v);
|
||
|
|
|
||
|
|
if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
|
||
|
|
{
|
||
|
|
// vcf_parse1() encountered a new contig or tag, undeclared in the
|
||
|
|
// header. At this point, the header must have been printed,
|
||
|
|
// proceeding would lead to a broken BCF file. Errors must be checked
|
||
|
|
// and cleared by the caller before we can proceed.
|
||
|
|
char errdescription[1024] = "";
|
||
|
|
hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
bcf1_sync(v); // check if the BCF record was modified
|
||
|
|
|
||
|
|
if ( v->unpacked & BCF_IS_64BIT )
|
||
|
|
{
|
||
|
|
hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
BGZF *fp = hfp->fp.bgzf;
|
||
|
|
uint8_t x[32];
|
||
|
|
u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
|
||
|
|
u32_to_le(v->indiv.l, x + 4);
|
||
|
|
i32_to_le(v->rid, x + 8);
|
||
|
|
u32_to_le(v->pos, x + 12);
|
||
|
|
u32_to_le(v->rlen, x + 16);
|
||
|
|
float_to_le(v->qual, x + 20);
|
||
|
|
u16_to_le(v->n_info, x + 24);
|
||
|
|
u16_to_le(v->n_allele, x + 26);
|
||
|
|
u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
|
||
|
|
if ( bgzf_write(fp, x, 32) != 32 ) return -1;
|
||
|
|
if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
|
||
|
|
if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
|
||
|
|
|
||
|
|
if (hfp->idx) {
|
||
|
|
if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
|
||
|
|
bgzf_tell(fp), 1) < 0)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**********************
|
||
|
|
*** VCF header I/O ***
|
||
|
|
**********************/
|
||
|
|
|
||
|
|
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
|
||
|
|
bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
|
||
|
|
int save_errno;
|
||
|
|
if (!hrec) goto fail;
|
||
|
|
|
||
|
|
hrec->key = strdup("contig");
|
||
|
|
if (!hrec->key) goto fail;
|
||
|
|
|
||
|
|
if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
|
||
|
|
if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
|
||
|
|
goto fail;
|
||
|
|
if (bcf_hdr_add_hrec(h, hrec) < 0)
|
||
|
|
goto fail;
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
save_errno = errno;
|
||
|
|
hts_log_error("%s", strerror(errno));
|
||
|
|
if (hrec) bcf_hrec_destroy(hrec);
|
||
|
|
errno = save_errno;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
|
||
|
|
{
|
||
|
|
kstring_t txt, *s = &fp->line;
|
||
|
|
int ret;
|
||
|
|
bcf_hdr_t *h;
|
||
|
|
tbx_t *idx = NULL;
|
||
|
|
const char **names = NULL;
|
||
|
|
h = bcf_hdr_init("r");
|
||
|
|
if (!h) {
|
||
|
|
hts_log_error("Failed to allocate bcf header");
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
txt.l = txt.m = 0; txt.s = 0;
|
||
|
|
while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
|
||
|
|
int e = 0;
|
||
|
|
if (s->l == 0) continue;
|
||
|
|
if (s->s[0] != '#') {
|
||
|
|
hts_log_error("No sample line");
|
||
|
|
goto error;
|
||
|
|
}
|
||
|
|
if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
|
||
|
|
kstring_t tmp = { 0, 0, NULL };
|
||
|
|
hFILE *f = hopen(fp->fn_aux, "r");
|
||
|
|
if (f == NULL) {
|
||
|
|
hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
|
||
|
|
goto error;
|
||
|
|
}
|
||
|
|
while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
|
||
|
|
char *tab = strchr(tmp.s, '\t');
|
||
|
|
if (tab == NULL) continue;
|
||
|
|
e |= (kputs("##contig=<ID=", &txt) < 0);
|
||
|
|
e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
|
||
|
|
e |= (kputs(",length=", &txt) < 0);
|
||
|
|
e |= (kputl(atol(tab), &txt) < 0);
|
||
|
|
e |= (kputsn(">\n", 2, &txt) < 0);
|
||
|
|
}
|
||
|
|
free(tmp.s);
|
||
|
|
if (hclose(f) != 0) {
|
||
|
|
hts_log_error("Error on closing %s", fp->fn_aux);
|
||
|
|
goto error;
|
||
|
|
}
|
||
|
|
if (e) goto error;
|
||
|
|
}
|
||
|
|
if (kputsn(s->s, s->l, &txt) < 0) goto error;
|
||
|
|
if (kputc('\n', &txt) < 0) goto error;
|
||
|
|
if (s->s[1] != '#') break;
|
||
|
|
}
|
||
|
|
if ( ret < -1 ) goto error;
|
||
|
|
if ( !txt.s )
|
||
|
|
{
|
||
|
|
hts_log_error("Could not read the header");
|
||
|
|
goto error;
|
||
|
|
}
|
||
|
|
if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
|
||
|
|
|
||
|
|
// check tabix index, are all contigs listed in the header? add the missing ones
|
||
|
|
idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
|
||
|
|
if ( idx )
|
||
|
|
{
|
||
|
|
int i, n, need_sync = 0;
|
||
|
|
names = tbx_seqnames(idx, &n);
|
||
|
|
if (!names) goto error;
|
||
|
|
for (i=0; i<n; i++)
|
||
|
|
{
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
|
||
|
|
if ( hrec ) continue;
|
||
|
|
if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
|
||
|
|
need_sync = 1;
|
||
|
|
}
|
||
|
|
if ( need_sync ) {
|
||
|
|
if (bcf_hdr_sync(h) < 0) goto error;
|
||
|
|
}
|
||
|
|
free(names);
|
||
|
|
tbx_destroy(idx);
|
||
|
|
}
|
||
|
|
free(txt.s);
|
||
|
|
return h;
|
||
|
|
|
||
|
|
error:
|
||
|
|
if (idx) tbx_destroy(idx);
|
||
|
|
free(names);
|
||
|
|
free(txt.s);
|
||
|
|
if (h) bcf_hdr_destroy(h);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
|
||
|
|
{
|
||
|
|
int i = 0, n = 0, save_errno;
|
||
|
|
char **lines = hts_readlines(fname, &n);
|
||
|
|
if ( !lines ) return 1;
|
||
|
|
for (i=0; i<n-1; i++)
|
||
|
|
{
|
||
|
|
int k;
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
|
||
|
|
if (!hrec) goto fail;
|
||
|
|
if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
|
||
|
|
bcf_hrec_destroy(hrec);
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
free(lines[i]);
|
||
|
|
lines[i] = NULL;
|
||
|
|
}
|
||
|
|
if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
|
||
|
|
if (bcf_hdr_sync(hdr) < 0) goto fail;
|
||
|
|
free(lines[n-1]);
|
||
|
|
free(lines);
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
save_errno = errno;
|
||
|
|
for (; i < n; i++)
|
||
|
|
free(lines[i]);
|
||
|
|
free(lines);
|
||
|
|
errno = save_errno;
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
|
||
|
|
{
|
||
|
|
uint32_t e = 0;
|
||
|
|
if ( !hrec->value )
|
||
|
|
{
|
||
|
|
int j, nout = 0;
|
||
|
|
e |= ksprintf(str, "##%s=<", hrec->key) < 0;
|
||
|
|
for (j=0; j<hrec->nkeys; j++)
|
||
|
|
{
|
||
|
|
// do not output IDX if output is VCF
|
||
|
|
if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
|
||
|
|
if ( nout ) e |= kputc(',',str) < 0;
|
||
|
|
e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
|
||
|
|
nout++;
|
||
|
|
}
|
||
|
|
e |= ksprintf(str,">\n") < 0;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
|
||
|
|
|
||
|
|
return e == 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
|
||
|
|
{
|
||
|
|
return _bcf_hrec_format(hrec,0,str);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
|
||
|
|
{
|
||
|
|
int i, r = 0;
|
||
|
|
for (i=0; i<hdr->nhrec; i++)
|
||
|
|
r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
|
||
|
|
|
||
|
|
r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
|
||
|
|
if ( bcf_hdr_nsamples(hdr) )
|
||
|
|
{
|
||
|
|
r |= ksprintf(str, "\tFORMAT") < 0;
|
||
|
|
for (i=0; i<bcf_hdr_nsamples(hdr); i++)
|
||
|
|
r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
|
||
|
|
}
|
||
|
|
r |= ksprintf(str, "\n") < 0;
|
||
|
|
|
||
|
|
return r ? -1 : 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
|
||
|
|
{
|
||
|
|
kstring_t txt = {0,0,0};
|
||
|
|
if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
|
||
|
|
return NULL;
|
||
|
|
if ( len ) *len = txt.l;
|
||
|
|
return txt.s;
|
||
|
|
}
|
||
|
|
|
||
|
|
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
|
||
|
|
{
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
|
||
|
|
int i, tid, m = kh_size(d);
|
||
|
|
const char **names = (const char**) calloc(m,sizeof(const char*));
|
||
|
|
if ( !names )
|
||
|
|
{
|
||
|
|
hts_log_error("Failed to allocate memory");
|
||
|
|
*n = 0;
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
khint_t k;
|
||
|
|
for (k=kh_begin(d); k<kh_end(d); k++)
|
||
|
|
{
|
||
|
|
if ( !kh_exist(d,k) ) continue;
|
||
|
|
if ( !kh_val(d, k).hrec[0] ) continue; // removed via bcf_hdr_remove
|
||
|
|
tid = kh_val(d,k).id;
|
||
|
|
if ( tid >= m )
|
||
|
|
{
|
||
|
|
// This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
|
||
|
|
if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
|
||
|
|
{
|
||
|
|
hts_log_error("Failed to allocate memory");
|
||
|
|
*n = 0;
|
||
|
|
free(names);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
m = tid + 1;
|
||
|
|
}
|
||
|
|
names[tid] = kh_key(d,k);
|
||
|
|
}
|
||
|
|
// ensure there are no gaps
|
||
|
|
for (i=0,tid=0; tid<m; i++,tid++)
|
||
|
|
{
|
||
|
|
while ( tid<m && !names[tid] ) tid++;
|
||
|
|
if ( tid==m ) break;
|
||
|
|
if ( i==tid ) continue;
|
||
|
|
names[i] = names[tid];
|
||
|
|
names[tid] = 0;
|
||
|
|
}
|
||
|
|
*n = i;
|
||
|
|
return names;
|
||
|
|
}
|
||
|
|
|
||
|
|
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
|
||
|
|
{
|
||
|
|
kstring_t htxt = {0,0,0};
|
||
|
|
if (bcf_hdr_format(h, 0, &htxt) < 0) {
|
||
|
|
free(htxt.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
|
||
|
|
int ret;
|
||
|
|
if ( fp->format.compression!=no_compression ) {
|
||
|
|
ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
|
||
|
|
if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
|
||
|
|
} else {
|
||
|
|
ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
|
||
|
|
}
|
||
|
|
free(htxt.s);
|
||
|
|
return ret<0 ? -1 : 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/***********************
|
||
|
|
*** Typed value I/O ***
|
||
|
|
***********************/
|
||
|
|
|
||
|
|
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
|
||
|
|
{
|
||
|
|
int32_t max = INT32_MIN, min = INT32_MAX;
|
||
|
|
int i;
|
||
|
|
if (n <= 0) {
|
||
|
|
return bcf_enc_size(s, 0, BCF_BT_NULL);
|
||
|
|
} else if (n == 1) {
|
||
|
|
return bcf_enc_int1(s, a[0]);
|
||
|
|
} else {
|
||
|
|
if (wsize <= 0) wsize = n;
|
||
|
|
|
||
|
|
// Equivalent to:
|
||
|
|
// for (i = 0; i < n; ++i) {
|
||
|
|
// if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
|
||
|
|
// continue;
|
||
|
|
// if (max < a[i]) max = a[i];
|
||
|
|
// if (min > a[i]) min = a[i];
|
||
|
|
// }
|
||
|
|
int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
|
||
|
|
int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
|
||
|
|
for (i = 0; i < (n&~3); i+=4) {
|
||
|
|
// bcf_int32_missing == INT32_MIN and
|
||
|
|
// bcf_int32_vector_end == INT32_MIN+1.
|
||
|
|
// We skip these, but can mostly avoid explicit checking
|
||
|
|
if (max4[0] < a[i+0]) max4[0] = a[i+0];
|
||
|
|
if (max4[1] < a[i+1]) max4[1] = a[i+1];
|
||
|
|
if (max4[2] < a[i+2]) max4[2] = a[i+2];
|
||
|
|
if (max4[3] < a[i+3]) max4[3] = a[i+3];
|
||
|
|
if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
|
||
|
|
if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
|
||
|
|
if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
|
||
|
|
if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
|
||
|
|
}
|
||
|
|
min = min4[0];
|
||
|
|
if (min > min4[1]) min = min4[1];
|
||
|
|
if (min > min4[2]) min = min4[2];
|
||
|
|
if (min > min4[3]) min = min4[3];
|
||
|
|
max = max4[0];
|
||
|
|
if (max < max4[1]) max = max4[1];
|
||
|
|
if (max < max4[2]) max = max4[2];
|
||
|
|
if (max < max4[3]) max = max4[3];
|
||
|
|
for (; i < n; ++i) {
|
||
|
|
if (max < a[i]) max = a[i];
|
||
|
|
if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
|
||
|
|
}
|
||
|
|
|
||
|
|
if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
|
||
|
|
if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
|
||
|
|
ks_resize(s, s->l + n) < 0)
|
||
|
|
return -1;
|
||
|
|
uint8_t *p = (uint8_t *) s->s + s->l;
|
||
|
|
for (i = 0; i < n; ++i, p++) {
|
||
|
|
if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end;
|
||
|
|
else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
|
||
|
|
else *p = a[i];
|
||
|
|
}
|
||
|
|
s->l += n;
|
||
|
|
} else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
|
||
|
|
uint8_t *p;
|
||
|
|
if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
|
||
|
|
ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
|
||
|
|
return -1;
|
||
|
|
p = (uint8_t *) s->s + s->l;
|
||
|
|
for (i = 0; i < n; ++i)
|
||
|
|
{
|
||
|
|
int16_t x;
|
||
|
|
if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
|
||
|
|
else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
|
||
|
|
else x = a[i];
|
||
|
|
i16_to_le(x, p);
|
||
|
|
p += sizeof(int16_t);
|
||
|
|
}
|
||
|
|
s->l += n * sizeof(int16_t);
|
||
|
|
} else {
|
||
|
|
uint8_t *p;
|
||
|
|
if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
|
||
|
|
ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
|
||
|
|
return -1;
|
||
|
|
p = (uint8_t *) s->s + s->l;
|
||
|
|
for (i = 0; i < n; ++i) {
|
||
|
|
i32_to_le(a[i], p);
|
||
|
|
p += sizeof(int32_t);
|
||
|
|
}
|
||
|
|
s->l += n * sizeof(int32_t);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
#ifdef VCF_ALLOW_INT64
|
||
|
|
static int bcf_enc_long1(kstring_t *s, int64_t x) {
|
||
|
|
uint32_t e = 0;
|
||
|
|
if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
|
||
|
|
return bcf_enc_int1(s, x);
|
||
|
|
if (x == bcf_int64_vector_end) {
|
||
|
|
e |= bcf_enc_size(s, 1, BCF_BT_INT8);
|
||
|
|
e |= kputc(bcf_int8_vector_end, s) < 0;
|
||
|
|
} else if (x == bcf_int64_missing) {
|
||
|
|
e |= bcf_enc_size(s, 1, BCF_BT_INT8);
|
||
|
|
e |= kputc(bcf_int8_missing, s) < 0;
|
||
|
|
} else {
|
||
|
|
e |= bcf_enc_size(s, 1, BCF_BT_INT64);
|
||
|
|
e |= ks_expand(s, 8);
|
||
|
|
if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
|
||
|
|
}
|
||
|
|
return e == 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
|
||
|
|
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
|
||
|
|
uint8_t *p;
|
||
|
|
size_t i;
|
||
|
|
size_t bytes = n * sizeof(float);
|
||
|
|
|
||
|
|
if (bytes / sizeof(float) != n) return -1;
|
||
|
|
if (ks_resize(s, s->l + bytes) < 0) return -1;
|
||
|
|
|
||
|
|
p = (uint8_t *) s->s + s->l;
|
||
|
|
for (i = 0; i < n; i++) {
|
||
|
|
float_to_le(a[i], p);
|
||
|
|
p += sizeof(float);
|
||
|
|
}
|
||
|
|
s->l += bytes;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
|
||
|
|
{
|
||
|
|
assert(n >= 0);
|
||
|
|
bcf_enc_size(s, n, BCF_BT_FLOAT);
|
||
|
|
serialize_float_array(s, n, a);
|
||
|
|
return 0; // FIXME: check for errs in this function
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
|
||
|
|
{
|
||
|
|
bcf_enc_size(s, l, BCF_BT_CHAR);
|
||
|
|
kputsn(a, l, s);
|
||
|
|
return 0; // FIXME: check for errs in this function
|
||
|
|
}
|
||
|
|
|
||
|
|
// Special case of n==1 as it also occurs quite often in FORMAT data.
|
||
|
|
// This version is also small enough to get inlined.
|
||
|
|
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
|
||
|
|
uint32_t e = 0;
|
||
|
|
uint8_t *p = (uint8_t *)data;
|
||
|
|
int32_t v;
|
||
|
|
|
||
|
|
// helps gcc more than clang here. In billions of cycles:
|
||
|
|
// bcf_fmt_array1 bcf_fmt_array
|
||
|
|
// gcc7: 23.2 24.3
|
||
|
|
// gcc13: 21.6 23.0
|
||
|
|
// clang13: 27.1 27.8
|
||
|
|
switch (type) {
|
||
|
|
case BCF_BT_CHAR:
|
||
|
|
e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BCF_BT_INT8:
|
||
|
|
if (*(int8_t *)p != bcf_int8_vector_end) {
|
||
|
|
e |= ((*(int8_t *)p == bcf_int8_missing)
|
||
|
|
? kputc_('.', s)
|
||
|
|
: kputw(*(int8_t *)p, s)) < 0;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
case BCF_BT_INT16:
|
||
|
|
v = le_to_i16(p);
|
||
|
|
if (v != bcf_int16_vector_end) {
|
||
|
|
e |= (v == bcf_int16_missing
|
||
|
|
? kputc_('.', s)
|
||
|
|
: kputw(v, s)) < 0;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BCF_BT_INT32:
|
||
|
|
v = le_to_i32(p);
|
||
|
|
if (v != bcf_int32_vector_end) {
|
||
|
|
e |= (v == bcf_int32_missing
|
||
|
|
? kputc_('.', s)
|
||
|
|
: kputw(v, s)) < 0;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BCF_BT_FLOAT:
|
||
|
|
v = le_to_u32(p);
|
||
|
|
if (v != bcf_float_vector_end) {
|
||
|
|
e |= (v == bcf_float_missing
|
||
|
|
? kputc_('.', s)
|
||
|
|
: kputd(le_to_float(p), s)) < 0;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
hts_log_error("Unexpected type %d", type);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return e == 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
|
||
|
|
{
|
||
|
|
int j = 0;
|
||
|
|
uint32_t e = 0;
|
||
|
|
if (n == 0) {
|
||
|
|
return kputc_('.', s) >= 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (type == BCF_BT_CHAR)
|
||
|
|
{
|
||
|
|
char *p = (char *)data;
|
||
|
|
|
||
|
|
// Note bcf_str_missing is already accounted for in n==0 above.
|
||
|
|
if (n >= 8) {
|
||
|
|
char *p_end = memchr(p, 0, n);
|
||
|
|
e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
|
||
|
|
} else {
|
||
|
|
for (j = 0; j < n && *p; ++j, ++p)
|
||
|
|
e |= kputc(*p, s) < 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
#define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
|
||
|
|
uint8_t *p = (uint8_t *) data; \
|
||
|
|
for (j=0; j<n; j++, p += sizeof(type_t)) \
|
||
|
|
{ \
|
||
|
|
type_t v = convert(p); \
|
||
|
|
if ( is_vector_end ) break; \
|
||
|
|
if ( j ) e |= kputc_(',', s) < 0; \
|
||
|
|
e |= (is_missing ? kputc('.', s) : kprint) < 0; \
|
||
|
|
} \
|
||
|
|
}
|
||
|
|
switch (type) {
|
||
|
|
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break;
|
||
|
|
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
|
||
|
|
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
|
||
|
|
case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
|
||
|
|
default: hts_log_error("Unexpected type %d", type); exit(1); break;
|
||
|
|
}
|
||
|
|
#undef BRANCH
|
||
|
|
}
|
||
|
|
return e == 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
|
||
|
|
{
|
||
|
|
int x, type;
|
||
|
|
x = bcf_dec_size(ptr, &ptr, &type);
|
||
|
|
bcf_fmt_array(s, x, type, ptr);
|
||
|
|
return ptr + (x << bcf_type_shift[type]);
|
||
|
|
}
|
||
|
|
|
||
|
|
/********************
|
||
|
|
*** VCF site I/O ***
|
||
|
|
********************/
|
||
|
|
|
||
|
|
typedef struct {
|
||
|
|
int key; // Key for h->id[BCF_DT_ID][key] vdict
|
||
|
|
int max_m; // number of elements in field array (ie commas)
|
||
|
|
int size; // field size (max_l or max_g*4 if is_gt)
|
||
|
|
int offset; // offset of buf into h->mem
|
||
|
|
uint32_t is_gt:1, // is genotype
|
||
|
|
max_g:31; // maximum number of genotypes
|
||
|
|
uint32_t max_l; // length of field
|
||
|
|
uint32_t y; // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
|
||
|
|
uint8_t *buf; // Pointer into h->mem
|
||
|
|
} fmt_aux_t;
|
||
|
|
|
||
|
|
// fmt_aux_t field notes:
|
||
|
|
// max_* are biggest sizes of the various FORMAT fields across all samples.
|
||
|
|
// We use these after pivoting the data to ensure easy random access
|
||
|
|
// of a specific sample.
|
||
|
|
//
|
||
|
|
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
|
||
|
|
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
|
||
|
|
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
|
||
|
|
//
|
||
|
|
// These are computed in vcf_parse_format_max3 and used in
|
||
|
|
// vcf_parse_format_alloc4 to get the size.
|
||
|
|
//
|
||
|
|
// size is computed from max_g, max_l, max_m and is_gt. Once computed
|
||
|
|
// the max values are never accessed again.
|
||
|
|
//
|
||
|
|
// In theory all 4 vars could be coalesced into a single variable, but this
|
||
|
|
// significantly harms speed (even if done via a union). It's about 25-30%
|
||
|
|
// slower.
|
||
|
|
|
||
|
|
static inline int align_mem(kstring_t *s)
|
||
|
|
{
|
||
|
|
int e = 0;
|
||
|
|
if (s->l&7) {
|
||
|
|
uint64_t zero = 0;
|
||
|
|
e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
|
||
|
|
}
|
||
|
|
return e == 0 ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
#define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */
|
||
|
|
|
||
|
|
// detect FORMAT "."
|
||
|
|
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
|
||
|
|
const char *p, const char *q) {
|
||
|
|
const char *end = s->s + s->l;
|
||
|
|
if ( q>=end )
|
||
|
|
{
|
||
|
|
hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_NCOLS;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
v->n_fmt = 0;
|
||
|
|
if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
|
||
|
|
{
|
||
|
|
v->n_sample = bcf_hdr_nsamples(h);
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// get format information from the dictionary
|
||
|
|
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
|
||
|
|
const char *p, const char *q, fmt_aux_t *fmt) {
|
||
|
|
const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
|
||
|
|
char *t;
|
||
|
|
int j;
|
||
|
|
ks_tokaux_t aux1;
|
||
|
|
|
||
|
|
for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
|
||
|
|
if (j >= MAX_N_FMT) {
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
|
||
|
|
bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
*(char*)aux1.p = 0;
|
||
|
|
khint_t k = kh_get(vdict, d, t);
|
||
|
|
if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
|
||
|
|
if ( t[0]=='.' && t[1]==0 )
|
||
|
|
{
|
||
|
|
hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
kstring_t tmp = {0,0,0};
|
||
|
|
int l;
|
||
|
|
ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
||
|
|
free(tmp.s);
|
||
|
|
int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
|
||
|
|
if (res < 0) bcf_hrec_destroy(hrec);
|
||
|
|
if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
|
||
|
|
|
||
|
|
k = kh_get(vdict, d, t);
|
||
|
|
v->errcode |= BCF_ERR_TAG_UNDEF;
|
||
|
|
if (res || k == kh_end(d)) {
|
||
|
|
hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
|
||
|
|
fmt[j].key = kh_val(d, k).id;
|
||
|
|
fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
|
||
|
|
fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
|
||
|
|
v->n_fmt++;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// compute max
|
||
|
|
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
|
||
|
|
char *p, char *q, fmt_aux_t *fmt) {
|
||
|
|
int n_sample_ori = -1;
|
||
|
|
char *r = q + 1; // r: position in the format string
|
||
|
|
int l = 0, m = 1, g = 1, j;
|
||
|
|
v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
|
||
|
|
const char *end = s->s + s->l;
|
||
|
|
|
||
|
|
while ( r<end )
|
||
|
|
{
|
||
|
|
// can we skip some samples?
|
||
|
|
if ( h->keep_samples )
|
||
|
|
{
|
||
|
|
n_sample_ori++;
|
||
|
|
if ( !bit_array_test(h->keep_samples,n_sample_ori) )
|
||
|
|
{
|
||
|
|
while ( *r!='\t' && r<end ) r++;
|
||
|
|
if ( *r=='\t' ) { *r = 0; r++; }
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// collect fmt stats: max vector size, length, number of alleles
|
||
|
|
j = 0; // j-th format field
|
||
|
|
fmt_aux_t *f = fmt;
|
||
|
|
static char meta[256] = {
|
||
|
|
// \0 \t , / : |
|
||
|
|
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
|
||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
|
||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||
|
|
};
|
||
|
|
|
||
|
|
char *r_start = r;
|
||
|
|
for (;;) {
|
||
|
|
// Quickly skip ahead to an appropriate meta-character
|
||
|
|
while (!meta[(unsigned char)*r]) r++;
|
||
|
|
|
||
|
|
switch (*r) {
|
||
|
|
case ',':
|
||
|
|
m++;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case '|':
|
||
|
|
case '/':
|
||
|
|
if (f->is_gt) g++;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case '\t':
|
||
|
|
*r = 0; // fall through
|
||
|
|
|
||
|
|
default: // valid due to while loop above.
|
||
|
|
case '\0':
|
||
|
|
case ':':
|
||
|
|
l = r - r_start; r_start = r;
|
||
|
|
if (f->max_m < m) f->max_m = m;
|
||
|
|
if (f->max_l < l) f->max_l = l;
|
||
|
|
if (f->is_gt && f->max_g < g) f->max_g = g;
|
||
|
|
l = 0, m = g = 1;
|
||
|
|
if ( *r==':' ) {
|
||
|
|
j++; f++;
|
||
|
|
if ( j>=v->n_fmt ) {
|
||
|
|
hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
|
||
|
|
h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_NCOLS;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
} else goto end_for;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if ( r>=end ) break;
|
||
|
|
r++;
|
||
|
|
}
|
||
|
|
end_for:
|
||
|
|
v->n_sample++;
|
||
|
|
if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
|
||
|
|
r++;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// allocate memory for arrays
|
||
|
|
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
|
||
|
|
const char *p, const char *q,
|
||
|
|
fmt_aux_t *fmt) {
|
||
|
|
kstring_t *mem = (kstring_t*)&h->mem;
|
||
|
|
|
||
|
|
int j;
|
||
|
|
for (j = 0; j < v->n_fmt; ++j) {
|
||
|
|
fmt_aux_t *f = &fmt[j];
|
||
|
|
if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
|
||
|
|
|
||
|
|
if ((f->y>>4&0xf) == BCF_HT_STR) {
|
||
|
|
f->size = f->is_gt? f->max_g << 2 : f->max_l;
|
||
|
|
} else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
|
||
|
|
f->size = f->max_m << 2;
|
||
|
|
} else {
|
||
|
|
hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (align_mem(mem) < 0) {
|
||
|
|
hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Limit the total memory to ~2Gb per VCF row. This should mean
|
||
|
|
// malformed VCF data is less likely to take excessive memory and/or
|
||
|
|
// time.
|
||
|
|
if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
|
||
|
|
static int warned = 0;
|
||
|
|
if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
warned = 1;
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
f->size = -1;
|
||
|
|
f->offset = 0;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
f->offset = mem->l;
|
||
|
|
if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
|
||
|
|
hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
mem->l += v->n_sample * f->size;
|
||
|
|
}
|
||
|
|
|
||
|
|
{
|
||
|
|
int j;
|
||
|
|
for (j = 0; j < v->n_fmt; ++j)
|
||
|
|
fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
|
||
|
|
}
|
||
|
|
|
||
|
|
// check for duplicate tags
|
||
|
|
int i;
|
||
|
|
for (i=1; i<v->n_fmt; i++)
|
||
|
|
{
|
||
|
|
fmt_aux_t *ifmt = &fmt[i];
|
||
|
|
if ( ifmt->size==-1 ) continue; // already marked for removal
|
||
|
|
for (j=0; j<i; j++)
|
||
|
|
{
|
||
|
|
fmt_aux_t *jfmt = &fmt[j];
|
||
|
|
if ( jfmt->size==-1 ) continue; // already marked for removal
|
||
|
|
if ( ifmt->key!=jfmt->key ) continue;
|
||
|
|
static int warned = 0;
|
||
|
|
if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
warned = 1;
|
||
|
|
v->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
ifmt->size = -1;
|
||
|
|
ifmt->offset = 0;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Fill the sample fields
|
||
|
|
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
|
||
|
|
const char *p, const char *q, fmt_aux_t *fmt) {
|
||
|
|
static int extreme_val_warned = 0;
|
||
|
|
int n_sample_ori = -1;
|
||
|
|
// At beginning of the loop t points to the first char of a format
|
||
|
|
const char *t = q + 1;
|
||
|
|
int m = 0; // m: sample id
|
||
|
|
const int nsamples = bcf_hdr_nsamples(h);
|
||
|
|
|
||
|
|
const char *end = s->s + s->l;
|
||
|
|
while ( t<end )
|
||
|
|
{
|
||
|
|
// can we skip some samples?
|
||
|
|
if ( h->keep_samples )
|
||
|
|
{
|
||
|
|
n_sample_ori++;
|
||
|
|
if ( !bit_array_test(h->keep_samples,n_sample_ori) )
|
||
|
|
{
|
||
|
|
while ( *t && t<end ) t++;
|
||
|
|
t++;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( m == nsamples ) break;
|
||
|
|
|
||
|
|
int j = 0; // j-th format field, m-th sample
|
||
|
|
while ( t < end )
|
||
|
|
{
|
||
|
|
fmt_aux_t *z = &fmt[j++];
|
||
|
|
const int htype = z->y>>4&0xf;
|
||
|
|
if (!z->buf) {
|
||
|
|
hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
|
||
|
|
z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( z->size==-1 )
|
||
|
|
{
|
||
|
|
// this field is to be ignored, it's either too big or a duplicate
|
||
|
|
while ( *t != ':' && *t ) t++;
|
||
|
|
}
|
||
|
|
else if (htype == BCF_HT_STR) {
|
||
|
|
int l;
|
||
|
|
if (z->is_gt) {
|
||
|
|
// Genotypes.
|
||
|
|
// <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
|
||
|
|
int32_t is_phased = 0;
|
||
|
|
uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
|
||
|
|
uint32_t unreadable = 0;
|
||
|
|
uint32_t max = 0;
|
||
|
|
int overflow = 0;
|
||
|
|
for (l = 0;; ++t) {
|
||
|
|
if (*t == '.') {
|
||
|
|
++t, x[l++] = is_phased;
|
||
|
|
} else {
|
||
|
|
const char *tt = t;
|
||
|
|
uint32_t val;
|
||
|
|
// Or "v->n_allele < 10", but it doesn't
|
||
|
|
// seem to be any faster and this feels safer.
|
||
|
|
if (*t >= '0' && *t <= '9' &&
|
||
|
|
!(t[1] >= '0' && t[1] <= '9')) {
|
||
|
|
val = *t++ - '0';
|
||
|
|
} else {
|
||
|
|
val = hts_str2uint(t, (char **)&t,
|
||
|
|
sizeof(val) * CHAR_MAX - 2,
|
||
|
|
&overflow);
|
||
|
|
unreadable |= tt == t;
|
||
|
|
}
|
||
|
|
if (max < val) max = val;
|
||
|
|
x[l++] = (val + 1) << 1 | is_phased;
|
||
|
|
}
|
||
|
|
is_phased = (*t == '|');
|
||
|
|
if (*t != '|' && *t != '/') break;
|
||
|
|
}
|
||
|
|
// Possibly check max against v->n_allele instead?
|
||
|
|
if (overflow || max > (INT32_MAX >> 1) - 1) {
|
||
|
|
hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if (unreadable) {
|
||
|
|
hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if ( !l ) x[l++] = 0; // An empty field, insert missing value
|
||
|
|
for (; l < z->size>>2; ++l)
|
||
|
|
x[l] = bcf_int32_vector_end;
|
||
|
|
|
||
|
|
} else {
|
||
|
|
// Otherwise arbitrary strings
|
||
|
|
char *x = (char*)z->buf + z->size * (size_t)m;
|
||
|
|
for (l = 0; *t != ':' && *t; ++t)
|
||
|
|
x[l++] = *t;
|
||
|
|
if (z->size > l)
|
||
|
|
memset(&x[l], 0, (z->size-l) * sizeof(*x));
|
||
|
|
}
|
||
|
|
|
||
|
|
} else if (htype == BCF_HT_INT) {
|
||
|
|
// One or more integers in an array
|
||
|
|
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
|
||
|
|
int l;
|
||
|
|
for (l = 0;; ++t) {
|
||
|
|
if (*t == '.') {
|
||
|
|
x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
|
||
|
|
} else {
|
||
|
|
int overflow = 0;
|
||
|
|
char *te;
|
||
|
|
long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
|
||
|
|
if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
|
||
|
|
{
|
||
|
|
if ( !extreme_val_warned )
|
||
|
|
{
|
||
|
|
hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
|
||
|
|
h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
extreme_val_warned = 1;
|
||
|
|
}
|
||
|
|
tmp_val = bcf_int32_missing;
|
||
|
|
}
|
||
|
|
x[l++] = tmp_val;
|
||
|
|
t = te;
|
||
|
|
}
|
||
|
|
if (*t != ',') break;
|
||
|
|
}
|
||
|
|
if ( !l )
|
||
|
|
x[l++] = bcf_int32_missing;
|
||
|
|
for (; l < z->size>>2; ++l)
|
||
|
|
x[l] = bcf_int32_vector_end;
|
||
|
|
|
||
|
|
} else if (htype == BCF_HT_REAL) {
|
||
|
|
// One of more floating point values in an array
|
||
|
|
float *x = (float*)(z->buf + z->size * (size_t)m);
|
||
|
|
int l;
|
||
|
|
for (l = 0;; ++t) {
|
||
|
|
if (*t == '.' && !isdigit_c(t[1])) {
|
||
|
|
bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
|
||
|
|
} else {
|
||
|
|
int overflow = 0;
|
||
|
|
char *te;
|
||
|
|
float tmp_val = hts_str2dbl(t, &te, &overflow);
|
||
|
|
if ( (te==t || overflow) && !extreme_val_warned )
|
||
|
|
{
|
||
|
|
hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
|
||
|
|
extreme_val_warned = 1;
|
||
|
|
}
|
||
|
|
x[l++] = tmp_val;
|
||
|
|
t = te;
|
||
|
|
}
|
||
|
|
if (*t != ',') break;
|
||
|
|
}
|
||
|
|
if ( !l )
|
||
|
|
// An empty field, insert missing value
|
||
|
|
bcf_float_set_missing(x[l++]);
|
||
|
|
for (; l < z->size>>2; ++l)
|
||
|
|
bcf_float_set_vector_end(x[l]);
|
||
|
|
} else {
|
||
|
|
hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (*t == '\0') {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
else if (*t == ':') {
|
||
|
|
t++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
char buffer[8];
|
||
|
|
hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
|
||
|
|
hts_strprint(buffer, sizeof buffer, '\'', t, 1),
|
||
|
|
h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_CHAR;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// fill end-of-vector values
|
||
|
|
for (; j < v->n_fmt; ++j) {
|
||
|
|
fmt_aux_t *z = &fmt[j];
|
||
|
|
const int htype = z->y>>4&0xf;
|
||
|
|
int l;
|
||
|
|
|
||
|
|
if (z->size == -1) // this field is to be ignored
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (htype == BCF_HT_STR) {
|
||
|
|
if (z->is_gt) {
|
||
|
|
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
|
||
|
|
if (z->size) x[0] = bcf_int32_missing;
|
||
|
|
for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
|
||
|
|
} else {
|
||
|
|
char *x = (char*)z->buf + z->size * (size_t)m;
|
||
|
|
if ( z->size ) {
|
||
|
|
x[0] = '.';
|
||
|
|
memset(&x[1], 0, (z->size-1) * sizeof(*x));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} else if (htype == BCF_HT_INT) {
|
||
|
|
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
|
||
|
|
x[0] = bcf_int32_missing;
|
||
|
|
for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
|
||
|
|
} else if (htype == BCF_HT_REAL) {
|
||
|
|
float *x = (float*)(z->buf + z->size * (size_t)m);
|
||
|
|
bcf_float_set_missing(x[0]);
|
||
|
|
for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
m++; t++;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// write individual genotype information
|
||
|
|
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
|
||
|
|
const char *p, const char *q, fmt_aux_t *fmt) {
|
||
|
|
kstring_t *str = &v->indiv;
|
||
|
|
int i, need_downsize = 0;
|
||
|
|
if (v->n_sample > 0) {
|
||
|
|
for (i = 0; i < v->n_fmt; ++i) {
|
||
|
|
fmt_aux_t *z = &fmt[i];
|
||
|
|
if ( z->size==-1 ) {
|
||
|
|
need_downsize = 1;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
bcf_enc_int1(str, z->key);
|
||
|
|
if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
|
||
|
|
bcf_enc_size(str, z->size, BCF_BT_CHAR);
|
||
|
|
kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
|
||
|
|
} else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
|
||
|
|
bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
|
||
|
|
} else {
|
||
|
|
bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
|
||
|
|
if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
|
||
|
|
(float *) z->buf) != 0) {
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
if ( need_downsize ) {
|
||
|
|
i = 0;
|
||
|
|
while ( i < v->n_fmt ) {
|
||
|
|
if ( fmt[i].size==-1 )
|
||
|
|
{
|
||
|
|
v->n_fmt--;
|
||
|
|
if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
|
||
|
|
}
|
||
|
|
else
|
||
|
|
i++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// validity checking
|
||
|
|
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
|
||
|
|
if ( v->n_sample!=bcf_hdr_nsamples(h) )
|
||
|
|
{
|
||
|
|
hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
|
||
|
|
bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
|
||
|
|
v->errcode |= BCF_ERR_NCOLS;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if ( v->indiv.l > 0xffffffff )
|
||
|
|
{
|
||
|
|
hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
|
||
|
|
// Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
|
||
|
|
v->n_fmt = 0;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// p,q is the start and the end of the FORMAT field
|
||
|
|
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
|
||
|
|
char *p, char *q)
|
||
|
|
{
|
||
|
|
if ( !bcf_hdr_nsamples(h) ) return 0;
|
||
|
|
kstring_t *mem = (kstring_t*)&h->mem;
|
||
|
|
mem->l = 0;
|
||
|
|
|
||
|
|
fmt_aux_t fmt[MAX_N_FMT];
|
||
|
|
|
||
|
|
// detect FORMAT "."
|
||
|
|
int ret; // +ve = ok, -ve = err
|
||
|
|
if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
|
||
|
|
return ret ? 0 : -1;
|
||
|
|
|
||
|
|
// get format information from the dictionary
|
||
|
|
if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
|
||
|
|
// stored as per-type arrays AAA... BBB... CCC... This is basically
|
||
|
|
// a data rotation or pivot.
|
||
|
|
|
||
|
|
// The size of elements in the array grow to their maximum needed,
|
||
|
|
// permitting fast random access. This means however we have to first
|
||
|
|
// scan the whole FORMAT line to find the maximum of each type, and
|
||
|
|
// then scan it again to find the store the data.
|
||
|
|
// We break this down into compute-max, allocate, fill-out-buffers
|
||
|
|
|
||
|
|
// TODO: ?
|
||
|
|
// The alternative would be to pivot on the first pass, with fixed
|
||
|
|
// size entries for numerics and concatenated strings otherwise, also
|
||
|
|
// tracking maximum sizes. Then on a second pass we reallocate and
|
||
|
|
// copy the data again to a uniformly sized array. Two passes through
|
||
|
|
// memory, but without doubling string parsing.
|
||
|
|
|
||
|
|
// compute max
|
||
|
|
if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// allocate memory for arrays
|
||
|
|
if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// fill the sample fields; at beginning of the loop
|
||
|
|
if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// write individual genotype information
|
||
|
|
if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// validity checking
|
||
|
|
if (vcf_parse_format_check7(h, v) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
|
||
|
|
// Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
|
||
|
|
// been already printed, but will enable tools like vcfcheck to proceed.
|
||
|
|
|
||
|
|
kstring_t tmp = {0,0,0};
|
||
|
|
khint_t k;
|
||
|
|
int l;
|
||
|
|
if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
|
||
|
|
return kh_end(d);
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
||
|
|
free(tmp.s);
|
||
|
|
int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
|
||
|
|
if (res < 0) bcf_hrec_destroy(hrec);
|
||
|
|
if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
|
||
|
|
k = kh_get(vdict, d, p);
|
||
|
|
|
||
|
|
return k;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
|
||
|
|
int i, n_flt = 1, max_n_flt = 0;
|
||
|
|
char *r, *t;
|
||
|
|
int32_t *a_flt = NULL;
|
||
|
|
ks_tokaux_t aux1;
|
||
|
|
khint_t k;
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
|
||
|
|
// count the number of filters
|
||
|
|
if (*(q-1) == ';') *(q-1) = 0;
|
||
|
|
for (r = p; *r; ++r)
|
||
|
|
if (*r == ';') ++n_flt;
|
||
|
|
if (n_flt > max_n_flt) {
|
||
|
|
a_flt = malloc(n_flt * sizeof(*a_flt));
|
||
|
|
if (!a_flt) {
|
||
|
|
hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
max_n_flt = n_flt;
|
||
|
|
}
|
||
|
|
// add filters
|
||
|
|
for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
|
||
|
|
*(char*)aux1.p = 0;
|
||
|
|
k = kh_get(vdict, d, t);
|
||
|
|
if (k == kh_end(d))
|
||
|
|
{
|
||
|
|
// Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
|
||
|
|
// been already printed, but will enable tools like vcfcheck to proceed.
|
||
|
|
hts_log_warning("FILTER '%s' is not defined in the header", t);
|
||
|
|
kstring_t tmp = {0,0,0};
|
||
|
|
int l;
|
||
|
|
ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
||
|
|
free(tmp.s);
|
||
|
|
int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
|
||
|
|
if (res < 0) bcf_hrec_destroy(hrec);
|
||
|
|
if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
|
||
|
|
k = kh_get(vdict, d, t);
|
||
|
|
v->errcode |= BCF_ERR_TAG_UNDEF;
|
||
|
|
if (res || k == kh_end(d)) {
|
||
|
|
hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
free(a_flt);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
a_flt[i++] = kh_val(d, k).id;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_enc_vint(str, n_flt, a_flt, -1);
|
||
|
|
free(a_flt);
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
|
||
|
|
static int extreme_int_warned = 0, negative_rlen_warned = 0;
|
||
|
|
int max_n_val = 0, overflow = 0;
|
||
|
|
char *r, *key;
|
||
|
|
khint_t k;
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
|
||
|
|
int32_t *a_val = NULL;
|
||
|
|
|
||
|
|
v->n_info = 0;
|
||
|
|
if (*(q-1) == ';') *(q-1) = 0;
|
||
|
|
for (r = key = p;; ++r) {
|
||
|
|
int c;
|
||
|
|
char *val, *end;
|
||
|
|
while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
|
||
|
|
if (v->n_info == UINT16_MAX) {
|
||
|
|
hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
|
||
|
|
bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
val = end = NULL;
|
||
|
|
c = *r; *r = 0;
|
||
|
|
if (c == '=') {
|
||
|
|
val = r + 1;
|
||
|
|
|
||
|
|
for (end = val; *end != ';' && *end != 0; ++end);
|
||
|
|
c = *end; *end = 0;
|
||
|
|
} else end = r;
|
||
|
|
if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
|
||
|
|
k = kh_get(vdict, d, key);
|
||
|
|
if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
|
||
|
|
{
|
||
|
|
hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
|
||
|
|
kstring_t tmp = {0,0,0};
|
||
|
|
int l;
|
||
|
|
ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
|
||
|
|
bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
||
|
|
free(tmp.s);
|
||
|
|
int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
|
||
|
|
if (res < 0) bcf_hrec_destroy(hrec);
|
||
|
|
if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
|
||
|
|
k = kh_get(vdict, d, key);
|
||
|
|
v->errcode |= BCF_ERR_TAG_UNDEF;
|
||
|
|
if (res || k == kh_end(d)) {
|
||
|
|
hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
|
||
|
|
++v->n_info;
|
||
|
|
bcf_enc_int1(str, kh_val(d, k).id);
|
||
|
|
if (val == 0) {
|
||
|
|
bcf_enc_size(str, 0, BCF_BT_NULL);
|
||
|
|
} else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
|
||
|
|
bcf_enc_vchar(str, end - val, val);
|
||
|
|
} else { // int/float value/array
|
||
|
|
int i, n_val;
|
||
|
|
char *t, *te;
|
||
|
|
for (t = val, n_val = 1; *t; ++t) // count the number of values
|
||
|
|
if (*t == ',') ++n_val;
|
||
|
|
// Check both int and float size in one step for simplicity
|
||
|
|
if (n_val > max_n_val) {
|
||
|
|
int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
|
||
|
|
if (!a_tmp) {
|
||
|
|
hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
a_val = a_tmp;
|
||
|
|
max_n_val = n_val;
|
||
|
|
}
|
||
|
|
if ((y>>4&0xf) == BCF_HT_INT) {
|
||
|
|
i = 0, t = val;
|
||
|
|
int64_t val1;
|
||
|
|
int is_int64 = 0;
|
||
|
|
#ifdef VCF_ALLOW_INT64
|
||
|
|
if ( n_val==1 )
|
||
|
|
{
|
||
|
|
overflow = 0;
|
||
|
|
long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
|
||
|
|
if ( te==val ) tmp_val = bcf_int32_missing;
|
||
|
|
else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
|
||
|
|
{
|
||
|
|
if ( !extreme_int_warned )
|
||
|
|
{
|
||
|
|
hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
extreme_int_warned = 1;
|
||
|
|
}
|
||
|
|
tmp_val = bcf_int32_missing;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
is_int64 = 1;
|
||
|
|
val1 = tmp_val;
|
||
|
|
t = te;
|
||
|
|
i = 1; // this is just to avoid adding another nested block...
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
for (; i < n_val; ++i, ++t)
|
||
|
|
{
|
||
|
|
overflow = 0;
|
||
|
|
long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
|
||
|
|
if ( te==t ) tmp_val = bcf_int32_missing;
|
||
|
|
else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
|
||
|
|
{
|
||
|
|
if ( !extreme_int_warned )
|
||
|
|
{
|
||
|
|
hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
extreme_int_warned = 1;
|
||
|
|
}
|
||
|
|
tmp_val = bcf_int32_missing;
|
||
|
|
}
|
||
|
|
a_val[i] = tmp_val;
|
||
|
|
for (t = te; *t && *t != ','; t++);
|
||
|
|
}
|
||
|
|
if (n_val == 1) {
|
||
|
|
#ifdef VCF_ALLOW_INT64
|
||
|
|
if ( is_int64 )
|
||
|
|
{
|
||
|
|
v->unpacked |= BCF_IS_64BIT;
|
||
|
|
bcf_enc_long1(str, val1);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
bcf_enc_int1(str, (int32_t)val1);
|
||
|
|
#else
|
||
|
|
val1 = a_val[0];
|
||
|
|
bcf_enc_int1(str, (int32_t)val1);
|
||
|
|
#endif
|
||
|
|
} else {
|
||
|
|
bcf_enc_vint(str, n_val, a_val, -1);
|
||
|
|
}
|
||
|
|
if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
|
||
|
|
&& memcmp(key, "END", 4) == 0)
|
||
|
|
{
|
||
|
|
if ( val1 <= v->pos )
|
||
|
|
{
|
||
|
|
if ( !negative_rlen_warned )
|
||
|
|
{
|
||
|
|
hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
|
||
|
|
negative_rlen_warned = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
v->rlen = val1 - v->pos;
|
||
|
|
}
|
||
|
|
} else if ((y>>4&0xf) == BCF_HT_REAL) {
|
||
|
|
float *val_f = (float *)a_val;
|
||
|
|
for (i = 0, t = val; i < n_val; ++i, ++t)
|
||
|
|
{
|
||
|
|
overflow = 0;
|
||
|
|
val_f[i] = hts_str2dbl(t, &te, &overflow);
|
||
|
|
if ( te==t || overflow ) // conversion failed
|
||
|
|
bcf_float_set_missing(val_f[i]);
|
||
|
|
for (t = te; *t && *t != ','; t++);
|
||
|
|
}
|
||
|
|
bcf_enc_vfloat(str, n_val, val_f);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (c == 0) break;
|
||
|
|
r = end;
|
||
|
|
key = r + 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
free(a_val);
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
free(a_val);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
|
||
|
|
{
|
||
|
|
int ret = -2, overflow = 0;
|
||
|
|
char *p, *q, *r, *t;
|
||
|
|
kstring_t *str;
|
||
|
|
khint_t k;
|
||
|
|
ks_tokaux_t aux;
|
||
|
|
|
||
|
|
//#define NOT_DOT(p) strcmp((p), ".")
|
||
|
|
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
|
||
|
|
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
|
||
|
|
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
|
||
|
|
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
|
||
|
|
|
||
|
|
if (!s || !h || !v || !(s->s))
|
||
|
|
return ret;
|
||
|
|
|
||
|
|
// Assumed in lots of places, but we may as well spot this early
|
||
|
|
assert(sizeof(float) == sizeof(int32_t));
|
||
|
|
|
||
|
|
// Ensure string we parse has space to permit some over-flow when during
|
||
|
|
// parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over
|
||
|
|
// the more straight forward looking strcmp, giving a speed advantage.
|
||
|
|
if (ks_resize(s, s->l+4) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// Force our memory to be initialised so we avoid the technicality of
|
||
|
|
// undefined behaviour in using a 4-byte memcmp. (The reality is this
|
||
|
|
// almost certainly is never detected by the compiler so has no impact,
|
||
|
|
// but equally so this code has minimal (often beneficial) impact on
|
||
|
|
// performance too.)
|
||
|
|
s->s[s->l+0] = 0;
|
||
|
|
s->s[s->l+1] = 0;
|
||
|
|
s->s[s->l+2] = 0;
|
||
|
|
s->s[s->l+3] = 0;
|
||
|
|
|
||
|
|
bcf_clear1(v);
|
||
|
|
str = &v->shared;
|
||
|
|
memset(&aux, 0, sizeof(ks_tokaux_t));
|
||
|
|
|
||
|
|
// CHROM
|
||
|
|
if (!(p = kstrtok(s->s, "\t", &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
|
||
|
|
k = kh_get(vdict, d, p);
|
||
|
|
if (k == kh_end(d)) {
|
||
|
|
hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
|
||
|
|
v->errcode = BCF_ERR_CTG_UNDEF;
|
||
|
|
if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
|
||
|
|
hts_log_error("Could not add dummy header for contig '%s'", p);
|
||
|
|
v->errcode |= BCF_ERR_CTG_INVALID;
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
v->rid = kh_val(d, k).id;
|
||
|
|
|
||
|
|
// POS
|
||
|
|
if (!(p = kstrtok(0, 0, &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
overflow = 0;
|
||
|
|
char *tmp = p;
|
||
|
|
v->pos = hts_str2uint(p, &p, 62, &overflow);
|
||
|
|
if (overflow) {
|
||
|
|
hts_log_error("Position value '%s' is too large", tmp);
|
||
|
|
goto err;
|
||
|
|
} else if ( *p ) {
|
||
|
|
hts_log_error("Could not parse the position '%s'", tmp);
|
||
|
|
goto err;
|
||
|
|
} else {
|
||
|
|
v->pos -= 1;
|
||
|
|
}
|
||
|
|
if (v->pos >= INT32_MAX)
|
||
|
|
v->unpacked |= BCF_IS_64BIT;
|
||
|
|
|
||
|
|
// ID
|
||
|
|
if (!(p = kstrtok(0, 0, &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
|
||
|
|
else bcf_enc_size(str, 0, BCF_BT_CHAR);
|
||
|
|
|
||
|
|
// REF
|
||
|
|
if (!(p = kstrtok(0, 0, &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
bcf_enc_vchar(str, q - p, p);
|
||
|
|
v->n_allele = 1, v->rlen = q - p;
|
||
|
|
|
||
|
|
// ALT
|
||
|
|
if (!(p = kstrtok(0, 0, &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
if (NOT_DOT(p)) {
|
||
|
|
for (r = t = p;; ++r) {
|
||
|
|
if (*r == ',' || *r == 0) {
|
||
|
|
if (v->n_allele == UINT16_MAX) {
|
||
|
|
hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
|
||
|
|
bcf_seqname_safe(h,v), v->pos+1);
|
||
|
|
v->errcode |= BCF_ERR_LIMITS;
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
bcf_enc_vchar(str, r - t, t);
|
||
|
|
t = r + 1;
|
||
|
|
++v->n_allele;
|
||
|
|
}
|
||
|
|
if (r == q) break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// QUAL
|
||
|
|
if (!(p = kstrtok(0, 0, &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
if (NOT_DOT(p)) v->qual = atof(p);
|
||
|
|
else bcf_float_set_missing(v->qual);
|
||
|
|
if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
|
||
|
|
|
||
|
|
// FILTER
|
||
|
|
if (!(p = kstrtok(0, 0, &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
if (NOT_DOT(p)) {
|
||
|
|
if (vcf_parse_filter(str, h, v, p, q)) {
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
} else bcf_enc_vint(str, 0, 0, -1);
|
||
|
|
if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
|
||
|
|
|
||
|
|
// INFO
|
||
|
|
if (!(p = kstrtok(0, 0, &aux)))
|
||
|
|
goto err;
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
if (NOT_DOT(p)) {
|
||
|
|
if (vcf_parse_info(str, h, v, p, q)) {
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
|
||
|
|
|
||
|
|
// FORMAT; optional
|
||
|
|
p = kstrtok(0, 0, &aux);
|
||
|
|
if (p) {
|
||
|
|
*(q = (char*)aux.p) = 0;
|
||
|
|
|
||
|
|
return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
|
||
|
|
} else {
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
end:
|
||
|
|
ret = 0;
|
||
|
|
|
||
|
|
err:
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
int vcf_open_mode(char *mode, const char *fn, const char *format)
|
||
|
|
{
|
||
|
|
if (format == NULL) {
|
||
|
|
// Try to pick a format based on the filename extension
|
||
|
|
char extension[HTS_MAX_EXT_LEN];
|
||
|
|
if (find_file_extension(fn, extension) < 0) return -1;
|
||
|
|
return vcf_open_mode(mode, fn, extension);
|
||
|
|
}
|
||
|
|
else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
|
||
|
|
else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
|
||
|
|
else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
|
||
|
|
else return -1;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
|
||
|
|
{
|
||
|
|
int ret;
|
||
|
|
ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
|
||
|
|
if (ret < 0) return ret;
|
||
|
|
return vcf_parse1(&fp->line, h, v);
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
|
||
|
|
{
|
||
|
|
uint8_t *ptr_start = ptr;
|
||
|
|
fmt->id = bcf_dec_typed_int1(ptr, &ptr);
|
||
|
|
fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
|
||
|
|
fmt->size = fmt->n << bcf_type_shift[fmt->type];
|
||
|
|
fmt->p = ptr;
|
||
|
|
fmt->p_off = ptr - ptr_start;
|
||
|
|
fmt->p_free = 0;
|
||
|
|
ptr += n_sample * fmt->size;
|
||
|
|
fmt->p_len = ptr - fmt->p;
|
||
|
|
return ptr;
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
|
||
|
|
{
|
||
|
|
uint8_t *ptr_start = ptr;
|
||
|
|
int64_t len = 0;
|
||
|
|
info->key = bcf_dec_typed_int1(ptr, &ptr);
|
||
|
|
len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
|
||
|
|
info->vptr = ptr;
|
||
|
|
info->vptr_off = ptr - ptr_start;
|
||
|
|
info->vptr_free = 0;
|
||
|
|
info->v1.i = 0;
|
||
|
|
if (info->len == 1) {
|
||
|
|
switch(info->type) {
|
||
|
|
case BCF_BT_INT8:
|
||
|
|
case BCF_BT_CHAR:
|
||
|
|
info->v1.i = *(int8_t*)ptr;
|
||
|
|
break;
|
||
|
|
case BCF_BT_INT16:
|
||
|
|
info->v1.i = le_to_i16(ptr);
|
||
|
|
len <<= 1;
|
||
|
|
break;
|
||
|
|
case BCF_BT_INT32:
|
||
|
|
info->v1.i = le_to_i32(ptr);
|
||
|
|
len <<= 2;
|
||
|
|
break;
|
||
|
|
case BCF_BT_FLOAT:
|
||
|
|
info->v1.f = le_to_float(ptr);
|
||
|
|
len <<= 2;
|
||
|
|
break;
|
||
|
|
case BCF_BT_INT64:
|
||
|
|
info->v1.i = le_to_i64(ptr);
|
||
|
|
len <<= 3;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
len <<= bcf_type_shift[info->type];
|
||
|
|
}
|
||
|
|
ptr += len;
|
||
|
|
|
||
|
|
info->vptr_len = ptr - info->vptr;
|
||
|
|
return ptr;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_unpack(bcf1_t *b, int which)
|
||
|
|
{
|
||
|
|
if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
|
||
|
|
uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
|
||
|
|
int i;
|
||
|
|
bcf_dec_t *d = &b->d;
|
||
|
|
if (which & BCF_UN_FLT) which |= BCF_UN_STR;
|
||
|
|
if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
|
||
|
|
if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
|
||
|
|
{
|
||
|
|
kstring_t tmp;
|
||
|
|
|
||
|
|
// ID
|
||
|
|
tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
|
||
|
|
ptr_ori = ptr;
|
||
|
|
ptr = bcf_fmt_sized_array(&tmp, ptr);
|
||
|
|
b->unpack_size[0] = ptr - ptr_ori;
|
||
|
|
kputc_('\0', &tmp);
|
||
|
|
d->id = tmp.s; d->m_id = tmp.m;
|
||
|
|
|
||
|
|
// REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
|
||
|
|
hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
|
||
|
|
tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
|
||
|
|
ptr_ori = ptr;
|
||
|
|
for (i = 0; i < b->n_allele; ++i) {
|
||
|
|
// Use offset within tmp.s as realloc may change pointer
|
||
|
|
d->allele[i] = (char *)(intptr_t)tmp.l;
|
||
|
|
ptr = bcf_fmt_sized_array(&tmp, ptr);
|
||
|
|
kputc_('\0', &tmp);
|
||
|
|
}
|
||
|
|
b->unpack_size[1] = ptr - ptr_ori;
|
||
|
|
d->als = tmp.s; d->m_als = tmp.m;
|
||
|
|
|
||
|
|
// Convert our offsets within tmp.s back to pointers again
|
||
|
|
for (i = 0; i < b->n_allele; ++i)
|
||
|
|
d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
|
||
|
|
b->unpacked |= BCF_UN_STR;
|
||
|
|
}
|
||
|
|
if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
|
||
|
|
ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
|
||
|
|
ptr_ori = ptr;
|
||
|
|
if (*ptr>>4) {
|
||
|
|
int type;
|
||
|
|
d->n_flt = bcf_dec_size(ptr, &ptr, &type);
|
||
|
|
hts_expand(int, d->n_flt, d->m_flt, d->flt);
|
||
|
|
for (i = 0; i < d->n_flt; ++i)
|
||
|
|
d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
|
||
|
|
} else ++ptr, d->n_flt = 0;
|
||
|
|
b->unpack_size[2] = ptr - ptr_ori;
|
||
|
|
b->unpacked |= BCF_UN_FLT;
|
||
|
|
}
|
||
|
|
if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
|
||
|
|
ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
|
||
|
|
hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
|
||
|
|
for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
|
||
|
|
for (i = 0; i < b->n_info; ++i)
|
||
|
|
ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
|
||
|
|
b->unpacked |= BCF_UN_INFO;
|
||
|
|
}
|
||
|
|
if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
|
||
|
|
ptr = (uint8_t*)b->indiv.s;
|
||
|
|
hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
|
||
|
|
for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
|
||
|
|
for (i = 0; i < b->n_fmt; ++i)
|
||
|
|
ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
|
||
|
|
b->unpacked |= BCF_UN_FMT;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
int32_t max_dt_id = h->n[BCF_DT_ID];
|
||
|
|
const char *chrom = bcf_seqname(h, v);
|
||
|
|
if (!chrom) {
|
||
|
|
hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
|
||
|
|
v->rid);
|
||
|
|
errno = EINVAL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
|
||
|
|
|
||
|
|
// Cache of key lengths so we don't keep repeatedly using them.
|
||
|
|
// This assumes we're not modifying the header between successive calls
|
||
|
|
// to vcf_format, but that would lead to many other forms of breakage
|
||
|
|
// so it feels like a valid assumption to make.
|
||
|
|
//
|
||
|
|
// We cannot just do this in bcf_hdr_sync as some code (eg bcftools
|
||
|
|
// annotate) manipulates the headers directly without calling sync to
|
||
|
|
// refresh the data structures. So we must do just-in-time length
|
||
|
|
// calculation during writes instead.
|
||
|
|
bcf_hdr_aux_t *aux = get_hdr_aux(h);
|
||
|
|
if (!aux->key_len) {
|
||
|
|
if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
size_t *key_len = aux->key_len;
|
||
|
|
|
||
|
|
kputs(chrom, s); // CHROM
|
||
|
|
kputc_('\t', s); kputll(v->pos + 1, s); // POS
|
||
|
|
kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
|
||
|
|
kputc_('\t', s); // REF
|
||
|
|
if (v->n_allele > 0) kputs(v->d.allele[0], s);
|
||
|
|
else kputc_('.', s);
|
||
|
|
kputc_('\t', s); // ALT
|
||
|
|
if (v->n_allele > 1) {
|
||
|
|
for (i = 1; i < v->n_allele; ++i) {
|
||
|
|
if (i > 1) kputc_(',', s);
|
||
|
|
kputs(v->d.allele[i], s);
|
||
|
|
}
|
||
|
|
} else kputc_('.', s);
|
||
|
|
kputc_('\t', s); // QUAL
|
||
|
|
if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
|
||
|
|
else kputd(v->qual, s);
|
||
|
|
kputc_('\t', s); // FILTER
|
||
|
|
if (v->d.n_flt) {
|
||
|
|
for (i = 0; i < v->d.n_flt; ++i) {
|
||
|
|
int32_t idx = v->d.flt[i];
|
||
|
|
if (idx < 0 || idx >= max_dt_id
|
||
|
|
|| h->id[BCF_DT_ID][idx].key == NULL) {
|
||
|
|
hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
|
||
|
|
idx, bcf_seqname_safe(h, v), v->pos + 1);
|
||
|
|
errno = EINVAL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if (i) kputc_(';', s);
|
||
|
|
if (!key_len[idx])
|
||
|
|
key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
|
||
|
|
kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
|
||
|
|
}
|
||
|
|
} else kputc_('.', s);
|
||
|
|
|
||
|
|
kputc_('\t', s); // INFO
|
||
|
|
if (v->n_info) {
|
||
|
|
uint8_t *ptr = v->shared.s
|
||
|
|
? (uint8_t *)v->shared.s + v->unpack_size[0] +
|
||
|
|
v->unpack_size[1] + v->unpack_size[2]
|
||
|
|
: NULL;
|
||
|
|
int first = 1;
|
||
|
|
bcf_info_t *info = v->d.info;
|
||
|
|
|
||
|
|
// Note if we duplicate this code into custom packed and unpacked
|
||
|
|
// implementations then we gain a bit more speed, particularly with
|
||
|
|
// clang 13 (up to 5%). Not sure why this is, but code duplication
|
||
|
|
// isn't pleasant and it's still faster adding packed support than
|
||
|
|
// not so it's a win, just not as good as it should be.
|
||
|
|
const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
|
||
|
|
for (i = 0; i < v->n_info; ++i) {
|
||
|
|
bcf_info_t in, *z;
|
||
|
|
if (info_packed) {
|
||
|
|
// Use a local bcf_info_t when data is packed
|
||
|
|
z = ∈
|
||
|
|
z->key = bcf_dec_typed_int1(ptr, &ptr);
|
||
|
|
z->len = bcf_dec_size(ptr, &ptr, &z->type);
|
||
|
|
z->vptr = ptr;
|
||
|
|
ptr += z->len << bcf_type_shift[z->type];
|
||
|
|
} else {
|
||
|
|
// Else previously unpacked INFO struct
|
||
|
|
z = &info[i];
|
||
|
|
|
||
|
|
// Also potentially since deleted
|
||
|
|
if ( !z->vptr ) continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
|
||
|
|
? &h->id[BCF_DT_ID][z->key]
|
||
|
|
: NULL;
|
||
|
|
|
||
|
|
if (!id || !id->key) {
|
||
|
|
hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
|
||
|
|
z->key,
|
||
|
|
z->key < 0 ? "negative"
|
||
|
|
: (z->key >= max_dt_id ? "too large" : "not present in the header"),
|
||
|
|
bcf_seqname_safe(h, v), v->pos+1);
|
||
|
|
errno = EINVAL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// KEY
|
||
|
|
if (!key_len[z->key])
|
||
|
|
key_len[z->key] = strlen(id->key);
|
||
|
|
size_t id_len = key_len[z->key];
|
||
|
|
if (ks_resize(s, s->l + 3 + id_len) < 0)
|
||
|
|
return -1;
|
||
|
|
char *sptr = s->s + s->l;
|
||
|
|
if ( !first ) {
|
||
|
|
*sptr++ = ';';
|
||
|
|
s->l++;
|
||
|
|
}
|
||
|
|
first = 0;
|
||
|
|
memcpy(sptr, id->key, id_len);
|
||
|
|
s->l += id_len;
|
||
|
|
|
||
|
|
// VALUE
|
||
|
|
if (z->len <= 0) continue;
|
||
|
|
sptr[id_len] = '=';
|
||
|
|
s->l++;
|
||
|
|
|
||
|
|
if (z->len != 1 || info_packed) {
|
||
|
|
bcf_fmt_array(s, z->len, z->type, z->vptr);
|
||
|
|
} else {
|
||
|
|
// Single length vectors are unpacked into their
|
||
|
|
// own info.v1 union and handled separately.
|
||
|
|
if (z->type == BCF_BT_FLOAT) {
|
||
|
|
if ( bcf_float_is_missing(z->v1.f) )
|
||
|
|
kputc_('.', s);
|
||
|
|
else
|
||
|
|
kputd(z->v1.f, s);
|
||
|
|
} else if (z->type == BCF_BT_CHAR) {
|
||
|
|
kputc_(z->v1.i, s);
|
||
|
|
} else if (z->type < BCF_BT_INT64) {
|
||
|
|
int64_t missing[] = {
|
||
|
|
0, // BCF_BT_NULL
|
||
|
|
bcf_int8_missing,
|
||
|
|
bcf_int16_missing,
|
||
|
|
bcf_int32_missing,
|
||
|
|
};
|
||
|
|
if (z->v1.i == missing[z->type])
|
||
|
|
kputc_('.', s);
|
||
|
|
else
|
||
|
|
kputw(z->v1.i, s);
|
||
|
|
} else if (z->type == BCF_BT_INT64) {
|
||
|
|
if (z->v1.i == bcf_int64_missing)
|
||
|
|
kputc_('.', s);
|
||
|
|
else
|
||
|
|
kputll(z->v1.i, s);
|
||
|
|
} else {
|
||
|
|
hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
|
||
|
|
errno = EINVAL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( first ) kputc_('.', s);
|
||
|
|
} else kputc_('.', s);
|
||
|
|
|
||
|
|
// FORMAT and individual information
|
||
|
|
if (v->n_sample) {
|
||
|
|
int i,j;
|
||
|
|
if ( v->n_fmt) {
|
||
|
|
uint8_t *ptr = (uint8_t *)v->indiv.s;
|
||
|
|
int gt_i = -1;
|
||
|
|
bcf_fmt_t *fmt = v->d.fmt;
|
||
|
|
int first = 1;
|
||
|
|
int fmt_packed = !(v->unpacked & BCF_UN_FMT);
|
||
|
|
|
||
|
|
if (fmt_packed) {
|
||
|
|
// Local fmt as we have an array of num FORMAT keys,
|
||
|
|
// each of which points to N.Sample values.
|
||
|
|
|
||
|
|
// No real gain to be had in handling unpacked data here,
|
||
|
|
// but it doesn't cost us much in complexity either and
|
||
|
|
// it gives us flexibility.
|
||
|
|
fmt = malloc(v->n_fmt * sizeof(*fmt));
|
||
|
|
if (!fmt)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// KEYS
|
||
|
|
for (i = 0; i < (int)v->n_fmt; ++i) {
|
||
|
|
bcf_fmt_t *z;
|
||
|
|
z = &fmt[i];
|
||
|
|
if (fmt_packed) {
|
||
|
|
z->id = bcf_dec_typed_int1(ptr, &ptr);
|
||
|
|
z->n = bcf_dec_size(ptr, &ptr, &z->type);
|
||
|
|
z->p = ptr;
|
||
|
|
z->size = z->n << bcf_type_shift[z->type];
|
||
|
|
ptr += v->n_sample * z->size;
|
||
|
|
}
|
||
|
|
if ( !z->p ) continue;
|
||
|
|
kputc_(!first ? ':' : '\t', s); first = 0;
|
||
|
|
|
||
|
|
bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
|
||
|
|
? &h->id[BCF_DT_ID][z->id]
|
||
|
|
: NULL;
|
||
|
|
|
||
|
|
if (!id || !id->key) {
|
||
|
|
hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
|
||
|
|
errno = EINVAL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!key_len[z->id])
|
||
|
|
key_len[z->id] = strlen(id->key);
|
||
|
|
size_t id_len = key_len[z->id];
|
||
|
|
kputsn(id->key, id_len, s);
|
||
|
|
if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
|
||
|
|
gt_i = i;
|
||
|
|
}
|
||
|
|
if ( first ) kputsn("\t.", 2, s);
|
||
|
|
|
||
|
|
// VALUES per sample
|
||
|
|
for (j = 0; j < v->n_sample; ++j) {
|
||
|
|
kputc_('\t', s);
|
||
|
|
first = 1;
|
||
|
|
bcf_fmt_t *f = fmt;
|
||
|
|
for (i = 0; i < (int)v->n_fmt; i++, f++) {
|
||
|
|
if ( !f->p ) continue;
|
||
|
|
if (!first) kputc_(':', s);
|
||
|
|
first = 0;
|
||
|
|
if (gt_i == i) {
|
||
|
|
bcf_format_gt(f,j,s);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
else if (f->n == 1)
|
||
|
|
bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
|
||
|
|
else
|
||
|
|
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Simpler loop post GT and at least 1 iteration
|
||
|
|
for (i++, f++; i < (int)v->n_fmt; i++, f++) {
|
||
|
|
if ( !f->p ) continue;
|
||
|
|
kputc_(':', s);
|
||
|
|
if (f->n == 1)
|
||
|
|
bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
|
||
|
|
else
|
||
|
|
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
|
||
|
|
}
|
||
|
|
if ( first ) kputc_('.', s);
|
||
|
|
}
|
||
|
|
if (fmt_packed)
|
||
|
|
free(fmt);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
for (j=0; j<=v->n_sample; j++)
|
||
|
|
kputsn("\t.", 2, s);
|
||
|
|
}
|
||
|
|
kputc('\n', s);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int vcf_write_line(htsFile *fp, kstring_t *line)
|
||
|
|
{
|
||
|
|
int ret;
|
||
|
|
if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
|
||
|
|
if ( fp->format.compression!=no_compression )
|
||
|
|
ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
|
||
|
|
else
|
||
|
|
ret = hwrite(fp->fp.hfile, line->s, line->l);
|
||
|
|
return ret==line->l ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
|
||
|
|
{
|
||
|
|
ssize_t ret;
|
||
|
|
fp->line.l = 0;
|
||
|
|
if (vcf_format1(h, v, &fp->line) != 0)
|
||
|
|
return -1;
|
||
|
|
if ( fp->format.compression!=no_compression ) {
|
||
|
|
if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
|
||
|
|
return -1;
|
||
|
|
if (fp->idx && !fp->fp.bgzf->mt)
|
||
|
|
hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
|
||
|
|
ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
|
||
|
|
} else {
|
||
|
|
ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (fp->idx && fp->format.compression == bgzf) {
|
||
|
|
int tid;
|
||
|
|
if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
|
||
|
|
tid, v->pos, v->pos + v->rlen,
|
||
|
|
bgzf_tell(fp->fp.bgzf), 1) < 0)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return ret==fp->line.l ? 0 : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/************************
|
||
|
|
* Data access routines *
|
||
|
|
************************/
|
||
|
|
|
||
|
|
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
|
||
|
|
{
|
||
|
|
khint_t k;
|
||
|
|
vdict_t *d = (vdict_t*)h->dict[which];
|
||
|
|
k = kh_get(vdict, d, id);
|
||
|
|
return k == kh_end(d)? -1 : kh_val(d, k).id;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/********************
|
||
|
|
*** BCF indexing ***
|
||
|
|
********************/
|
||
|
|
|
||
|
|
// Calculate number of index levels given min_shift and the header contig
|
||
|
|
// list. Also returns number of contigs in *nids_out.
|
||
|
|
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
|
||
|
|
int starting_n_lvls, int *nids_out)
|
||
|
|
{
|
||
|
|
int n_lvls, i, nids = 0;
|
||
|
|
int64_t max_len = 0, s;
|
||
|
|
|
||
|
|
for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
|
||
|
|
{
|
||
|
|
if ( !h->id[BCF_DT_CTG][i].val ) continue;
|
||
|
|
if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
|
||
|
|
max_len = h->id[BCF_DT_CTG][i].val->info[0];
|
||
|
|
nids++;
|
||
|
|
}
|
||
|
|
if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken.
|
||
|
|
max_len += 256;
|
||
|
|
s = hts_bin_maxpos(min_shift, starting_n_lvls);
|
||
|
|
for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
|
||
|
|
|
||
|
|
if (nids_out) *nids_out = nids;
|
||
|
|
return n_lvls;
|
||
|
|
}
|
||
|
|
|
||
|
|
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
|
||
|
|
{
|
||
|
|
int n_lvls;
|
||
|
|
bcf1_t *b = NULL;
|
||
|
|
hts_idx_t *idx = NULL;
|
||
|
|
bcf_hdr_t *h;
|
||
|
|
int r;
|
||
|
|
h = bcf_hdr_read(fp);
|
||
|
|
if ( !h ) return NULL;
|
||
|
|
int nids = 0;
|
||
|
|
n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
|
||
|
|
idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
|
||
|
|
if (!idx) goto fail;
|
||
|
|
b = bcf_init1();
|
||
|
|
if (!b) goto fail;
|
||
|
|
while ((r = bcf_read1(fp,h, b)) >= 0) {
|
||
|
|
int ret;
|
||
|
|
ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
|
||
|
|
if (ret < 0) goto fail;
|
||
|
|
}
|
||
|
|
if (r < -1) goto fail;
|
||
|
|
hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
|
||
|
|
bcf_destroy1(b);
|
||
|
|
bcf_hdr_destroy(h);
|
||
|
|
return idx;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
hts_idx_destroy(idx);
|
||
|
|
bcf_destroy1(b);
|
||
|
|
bcf_hdr_destroy(h);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
|
||
|
|
{
|
||
|
|
return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
|
||
|
|
}
|
||
|
|
|
||
|
|
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
|
||
|
|
{
|
||
|
|
return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
|
||
|
|
{
|
||
|
|
htsFile *fp;
|
||
|
|
hts_idx_t *idx;
|
||
|
|
tbx_t *tbx;
|
||
|
|
int ret;
|
||
|
|
if ((fp = hts_open(fn, "rb")) == 0) return -2;
|
||
|
|
if (n_threads)
|
||
|
|
hts_set_threads(fp, n_threads);
|
||
|
|
if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
|
||
|
|
switch (fp->format.format) {
|
||
|
|
case bcf:
|
||
|
|
if (!min_shift) {
|
||
|
|
hts_log_error("TBI indices for BCF files are not supported");
|
||
|
|
ret = -1;
|
||
|
|
} else {
|
||
|
|
idx = bcf_index(fp, min_shift);
|
||
|
|
if (idx) {
|
||
|
|
ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
|
||
|
|
if (ret < 0) ret = -4;
|
||
|
|
hts_idx_destroy(idx);
|
||
|
|
}
|
||
|
|
else ret = -1;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
case vcf:
|
||
|
|
tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
|
||
|
|
if (tbx) {
|
||
|
|
ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
|
||
|
|
if (ret < 0) ret = -4;
|
||
|
|
tbx_destroy(tbx);
|
||
|
|
}
|
||
|
|
else ret = -1;
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
ret = -3;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
hts_close(fp);
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
|
||
|
|
{
|
||
|
|
return bcf_index_build3(fn, fnidx, min_shift, 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_index_build(const char *fn, int min_shift)
|
||
|
|
{
|
||
|
|
return bcf_index_build3(fn, NULL, min_shift, 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Initialise fp->idx for the current format type.
|
||
|
|
// This must be called after the header has been written but no other data.
|
||
|
|
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
|
||
|
|
int n_lvls, fmt;
|
||
|
|
|
||
|
|
if (min_shift == 0) {
|
||
|
|
min_shift = 14;
|
||
|
|
n_lvls = 5;
|
||
|
|
fmt = HTS_FMT_TBI;
|
||
|
|
} else {
|
||
|
|
// Set initial n_lvls to match tbx_index()
|
||
|
|
int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
|
||
|
|
// Increase if necessary
|
||
|
|
n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
|
||
|
|
fmt = HTS_FMT_CSI;
|
||
|
|
}
|
||
|
|
|
||
|
|
fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
|
||
|
|
if (!fp->idx) return -1;
|
||
|
|
|
||
|
|
// Tabix meta data, added even in CSI for VCF
|
||
|
|
uint8_t conf[4*7];
|
||
|
|
u32_to_le(TBX_VCF, conf+0); // fmt
|
||
|
|
u32_to_le(1, conf+4); // name col
|
||
|
|
u32_to_le(2, conf+8); // beg col
|
||
|
|
u32_to_le(0, conf+12); // end col
|
||
|
|
u32_to_le('#', conf+16); // comment
|
||
|
|
u32_to_le(0, conf+20); // n.skip
|
||
|
|
u32_to_le(0, conf+24); // ref name len
|
||
|
|
if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
|
||
|
|
hts_idx_destroy(fp->idx);
|
||
|
|
fp->idx = NULL;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
fp->fnidx = fnidx;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Initialise fp->idx for the current format type.
|
||
|
|
// This must be called after the header has been written but no other data.
|
||
|
|
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
|
||
|
|
int n_lvls, nids = 0;
|
||
|
|
|
||
|
|
if (fp->format.compression != bgzf) {
|
||
|
|
hts_log_error("Indexing is only supported on BGZF-compressed files");
|
||
|
|
return -3; // Matches no-compression return for bcf_index_build3()
|
||
|
|
}
|
||
|
|
|
||
|
|
if (fp->format.format == vcf)
|
||
|
|
return vcf_idx_init(fp, h, min_shift, fnidx);
|
||
|
|
|
||
|
|
if (!min_shift)
|
||
|
|
min_shift = 14;
|
||
|
|
|
||
|
|
n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
|
||
|
|
|
||
|
|
fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
|
||
|
|
if (!fp->idx) return -1;
|
||
|
|
fp->fnidx = fnidx;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Finishes an index. Call after the last record has been written.
|
||
|
|
// Returns 0 on success, <0 on failure.
|
||
|
|
//
|
||
|
|
// NB: same format as SAM/BAM as it uses bgzf.
|
||
|
|
int bcf_idx_save(htsFile *fp) {
|
||
|
|
return sam_idx_save(fp);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*****************
|
||
|
|
*** Utilities ***
|
||
|
|
*****************/
|
||
|
|
|
||
|
|
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
|
||
|
|
{
|
||
|
|
int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
|
||
|
|
for (i=0; i<src->nhrec; i++)
|
||
|
|
{
|
||
|
|
if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
|
||
|
|
{
|
||
|
|
int j;
|
||
|
|
for (j=0; j<ndst_ori; j++)
|
||
|
|
{
|
||
|
|
if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
|
||
|
|
|
||
|
|
// Checking only the key part of generic lines, otherwise
|
||
|
|
// the VCFs are too verbose. Should we perhaps add a flag
|
||
|
|
// to bcf_hdr_combine() and make this optional?
|
||
|
|
if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
|
||
|
|
}
|
||
|
|
if ( j>=ndst_ori ) {
|
||
|
|
res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
||
|
|
if (res < 0) return -1;
|
||
|
|
need_sync += res;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if ( src->hrec[i]->type==BCF_HL_STR )
|
||
|
|
{
|
||
|
|
// NB: we are ignoring fields without ID
|
||
|
|
int j = bcf_hrec_find_key(src->hrec[i],"ID");
|
||
|
|
if ( j>=0 )
|
||
|
|
{
|
||
|
|
bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
|
||
|
|
if ( !rec ) {
|
||
|
|
res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
||
|
|
if (res < 0) return -1;
|
||
|
|
need_sync += res;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
int j = bcf_hrec_find_key(src->hrec[i],"ID");
|
||
|
|
assert( j>=0 ); // this should always be true for valid VCFs
|
||
|
|
|
||
|
|
bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
|
||
|
|
if ( !rec ) {
|
||
|
|
res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
||
|
|
if (res < 0) return -1;
|
||
|
|
need_sync += res;
|
||
|
|
} else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
|
||
|
|
{
|
||
|
|
// Check that both records are of the same type. The bcf_hdr_id2length
|
||
|
|
// macro cannot be used here because dst header is not synced yet.
|
||
|
|
vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
|
||
|
|
vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
|
||
|
|
khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
|
||
|
|
khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
|
||
|
|
if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
|
||
|
|
{
|
||
|
|
hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
|
||
|
|
src->hrec[i]->vals[0]);
|
||
|
|
ret |= 1;
|
||
|
|
}
|
||
|
|
if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
|
||
|
|
{
|
||
|
|
hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
|
||
|
|
src->hrec[i]->vals[0]);
|
||
|
|
ret |= 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( need_sync ) {
|
||
|
|
if (bcf_hdr_sync(dst) < 0) return -1;
|
||
|
|
}
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
|
||
|
|
{
|
||
|
|
if ( !dst )
|
||
|
|
{
|
||
|
|
// this will effectively strip existing IDX attributes from src to become dst
|
||
|
|
dst = bcf_hdr_init("r");
|
||
|
|
kstring_t htxt = {0,0,0};
|
||
|
|
if (bcf_hdr_format(src, 0, &htxt) < 0) {
|
||
|
|
free(htxt.s);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
|
||
|
|
bcf_hdr_destroy(dst);
|
||
|
|
dst = NULL;
|
||
|
|
}
|
||
|
|
free(htxt.s);
|
||
|
|
return dst;
|
||
|
|
}
|
||
|
|
|
||
|
|
int i, ndst_ori = dst->nhrec, need_sync = 0, res;
|
||
|
|
for (i=0; i<src->nhrec; i++)
|
||
|
|
{
|
||
|
|
if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
|
||
|
|
{
|
||
|
|
int j;
|
||
|
|
for (j=0; j<ndst_ori; j++)
|
||
|
|
{
|
||
|
|
if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
|
||
|
|
|
||
|
|
// Checking only the key part of generic lines, otherwise
|
||
|
|
// the VCFs are too verbose. Should we perhaps add a flag
|
||
|
|
// to bcf_hdr_combine() and make this optional?
|
||
|
|
if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
|
||
|
|
}
|
||
|
|
if ( j>=ndst_ori ) {
|
||
|
|
res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
||
|
|
if (res < 0) return NULL;
|
||
|
|
need_sync += res;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if ( src->hrec[i]->type==BCF_HL_STR )
|
||
|
|
{
|
||
|
|
// NB: we are ignoring fields without ID
|
||
|
|
int j = bcf_hrec_find_key(src->hrec[i],"ID");
|
||
|
|
if ( j>=0 )
|
||
|
|
{
|
||
|
|
bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
|
||
|
|
if ( !rec ) {
|
||
|
|
res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
||
|
|
if (res < 0) return NULL;
|
||
|
|
need_sync += res;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
int j = bcf_hrec_find_key(src->hrec[i],"ID");
|
||
|
|
assert( j>=0 ); // this should always be true for valid VCFs
|
||
|
|
|
||
|
|
bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
|
||
|
|
if ( !rec ) {
|
||
|
|
res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
||
|
|
if (res < 0) return NULL;
|
||
|
|
need_sync += res;
|
||
|
|
} else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
|
||
|
|
{
|
||
|
|
// Check that both records are of the same type. The bcf_hdr_id2length
|
||
|
|
// macro cannot be used here because dst header is not synced yet.
|
||
|
|
vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
|
||
|
|
vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
|
||
|
|
khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
|
||
|
|
khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
|
||
|
|
if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
|
||
|
|
{
|
||
|
|
hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
|
||
|
|
src->hrec[i]->vals[0]);
|
||
|
|
}
|
||
|
|
if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
|
||
|
|
{
|
||
|
|
hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
|
||
|
|
src->hrec[i]->vals[0]);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( need_sync ) {
|
||
|
|
if (bcf_hdr_sync(dst) < 0) return NULL;
|
||
|
|
}
|
||
|
|
return dst;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
if ( line->errcode )
|
||
|
|
{
|
||
|
|
char errordescription[1024] = "";
|
||
|
|
hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)), bcf_seqname_safe(src_hdr,line), line->pos+1);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
|
||
|
|
if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
|
||
|
|
{
|
||
|
|
int dict;
|
||
|
|
for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
|
||
|
|
{
|
||
|
|
src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
|
||
|
|
for (i=0; i<src_hdr->n[dict]; i++)
|
||
|
|
{
|
||
|
|
if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
|
||
|
|
{
|
||
|
|
src_hdr->transl[dict][i] = -1;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
|
||
|
|
if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( !src_hdr->ntransl )
|
||
|
|
{
|
||
|
|
free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
|
||
|
|
free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
|
||
|
|
src_hdr->ntransl = -1;
|
||
|
|
}
|
||
|
|
if ( src_hdr->ntransl==-1 ) return 0;
|
||
|
|
}
|
||
|
|
bcf_unpack(line,BCF_UN_ALL);
|
||
|
|
|
||
|
|
// CHROM
|
||
|
|
if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
|
||
|
|
|
||
|
|
// FILTER
|
||
|
|
for (i=0; i<line->d.n_flt; i++)
|
||
|
|
{
|
||
|
|
int src_id = line->d.flt[i];
|
||
|
|
if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
|
||
|
|
line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
||
|
|
}
|
||
|
|
|
||
|
|
// INFO
|
||
|
|
for (i=0; i<line->n_info; i++)
|
||
|
|
{
|
||
|
|
int src_id = line->d.info[i].key;
|
||
|
|
int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
|
||
|
|
if ( dst_id<0 ) continue;
|
||
|
|
line->d.info[i].key = dst_id;
|
||
|
|
if ( !line->d.info[i].vptr ) continue; // skip deleted
|
||
|
|
int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
||
|
|
int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
||
|
|
if ( src_size==dst_size ) // can overwrite
|
||
|
|
{
|
||
|
|
uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
|
||
|
|
if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
|
||
|
|
else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
|
||
|
|
else { *(uint32_t*)vptr = (uint32_t)dst_id; }
|
||
|
|
}
|
||
|
|
else // must realloc
|
||
|
|
{
|
||
|
|
bcf_info_t *info = &line->d.info[i];
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
bcf_enc_int1(&str, dst_id);
|
||
|
|
bcf_enc_size(&str, info->len,info->type);
|
||
|
|
uint32_t vptr_off = str.l;
|
||
|
|
kputsn((char*)info->vptr, info->vptr_len, &str);
|
||
|
|
if( info->vptr_free ) free(info->vptr - info->vptr_off);
|
||
|
|
info->vptr_off = vptr_off;
|
||
|
|
info->vptr = (uint8_t*)str.s + info->vptr_off;
|
||
|
|
info->vptr_free = 1;
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_INF;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// FORMAT
|
||
|
|
for (i=0; i<line->n_fmt; i++)
|
||
|
|
{
|
||
|
|
int src_id = line->d.fmt[i].id;
|
||
|
|
int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
|
||
|
|
if ( dst_id<0 ) continue;
|
||
|
|
line->d.fmt[i].id = dst_id;
|
||
|
|
if( !line->d.fmt[i].p ) continue; // skip deleted
|
||
|
|
int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
||
|
|
int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
||
|
|
if ( src_size==dst_size ) // can overwrite
|
||
|
|
{
|
||
|
|
uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
|
||
|
|
if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
|
||
|
|
else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
|
||
|
|
else { i32_to_le(dst_id, p + 1); }
|
||
|
|
}
|
||
|
|
else // must realloc
|
||
|
|
{
|
||
|
|
bcf_fmt_t *fmt = &line->d.fmt[i];
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
bcf_enc_int1(&str, dst_id);
|
||
|
|
bcf_enc_size(&str, fmt->n, fmt->type);
|
||
|
|
uint32_t p_off = str.l;
|
||
|
|
kputsn((char*)fmt->p, fmt->p_len, &str);
|
||
|
|
if( fmt->p_free ) free(fmt->p - fmt->p_off);
|
||
|
|
fmt->p_off = p_off;
|
||
|
|
fmt->p = (uint8_t*)str.s + fmt->p_off;
|
||
|
|
fmt->p_free = 1;
|
||
|
|
line->d.indiv_dirty = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
|
||
|
|
{
|
||
|
|
bcf_hdr_t *hout = bcf_hdr_init("r");
|
||
|
|
if (!hout) {
|
||
|
|
hts_log_error("Failed to allocate bcf header");
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
kstring_t htxt = {0,0,0};
|
||
|
|
if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
|
||
|
|
free(htxt.s);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
|
||
|
|
bcf_hdr_destroy(hout);
|
||
|
|
hout = NULL;
|
||
|
|
}
|
||
|
|
free(htxt.s);
|
||
|
|
return hout;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
|
||
|
|
{
|
||
|
|
void *names_hash = khash_str2int_init();
|
||
|
|
kstring_t htxt = {0,0,0};
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
bcf_hdr_t *h = bcf_hdr_init("w");
|
||
|
|
int r = 0;
|
||
|
|
if (!h || !names_hash) {
|
||
|
|
hts_log_error("Failed to allocate bcf header");
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
if (bcf_hdr_format(h0, 1, &htxt) < 0) {
|
||
|
|
hts_log_error("Failed to get header text");
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
|
||
|
|
int j;
|
||
|
|
for (j=0; j<n; j++) imap[j] = -1;
|
||
|
|
if ( bcf_hdr_nsamples(h0) > 0) {
|
||
|
|
char *p = find_chrom_header_line(htxt.s);
|
||
|
|
int i = 0, end = n? 8 : 7;
|
||
|
|
while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
|
||
|
|
if (i != end) {
|
||
|
|
hts_log_error("Wrong number of columns in header #CHROM line");
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
|
||
|
|
for (i = 0; i < n; ++i) {
|
||
|
|
if ( khash_str2int_has_key(names_hash,samples[i]) )
|
||
|
|
{
|
||
|
|
hts_log_error("Duplicate sample name \"%s\"", samples[i]);
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
|
||
|
|
if (imap[i] < 0) continue;
|
||
|
|
r |= kputc('\t', &str) < 0;
|
||
|
|
r |= kputs(samples[i], &str) < 0;
|
||
|
|
r |= khash_str2int_inc(names_hash,samples[i]) < 0;
|
||
|
|
}
|
||
|
|
} else r |= kputsn(htxt.s, htxt.l, &str) < 0;
|
||
|
|
while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
|
||
|
|
r |= kputc('\n',&str) < 0;
|
||
|
|
if (r) {
|
||
|
|
hts_log_error("%s", strerror(errno));
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
if ( bcf_hdr_parse(h, str.s) < 0 ) {
|
||
|
|
bcf_hdr_destroy(h);
|
||
|
|
h = NULL;
|
||
|
|
}
|
||
|
|
free(str.s);
|
||
|
|
free(htxt.s);
|
||
|
|
khash_str2int_destroy(names_hash);
|
||
|
|
return h;
|
||
|
|
|
||
|
|
err:
|
||
|
|
ks_free(&str);
|
||
|
|
ks_free(&htxt);
|
||
|
|
khash_str2int_destroy(names_hash);
|
||
|
|
bcf_hdr_destroy(h);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
|
||
|
|
{
|
||
|
|
if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
|
||
|
|
|
||
|
|
int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
|
||
|
|
hdr->keep_samples = (uint8_t*) calloc(narr,1);
|
||
|
|
if (!hdr->keep_samples) return -1;
|
||
|
|
|
||
|
|
hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
|
||
|
|
if ( !samples )
|
||
|
|
{
|
||
|
|
// exclude all samples
|
||
|
|
khint_t k;
|
||
|
|
vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
|
||
|
|
new_dict = kh_init(vdict);
|
||
|
|
if (!new_dict) return -1;
|
||
|
|
|
||
|
|
bcf_hdr_nsamples(hdr) = 0;
|
||
|
|
|
||
|
|
for (k = kh_begin(d); k != kh_end(d); ++k)
|
||
|
|
if (kh_exist(d, k)) free((char*)kh_key(d, k));
|
||
|
|
kh_destroy(vdict, d);
|
||
|
|
hdr->dict[BCF_DT_SAMPLE] = new_dict;
|
||
|
|
if (bcf_hdr_sync(hdr) < 0) return -1;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( samples[0]=='^' )
|
||
|
|
for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
|
||
|
|
|
||
|
|
int idx, n, ret = 0;
|
||
|
|
char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
|
||
|
|
if ( !smpls ) return -1;
|
||
|
|
for (i=0; i<n; i++)
|
||
|
|
{
|
||
|
|
idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
|
||
|
|
if ( idx<0 )
|
||
|
|
{
|
||
|
|
if ( !ret ) ret = i+1;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
assert( idx<bcf_hdr_nsamples(hdr) );
|
||
|
|
if ( samples[0]=='^' )
|
||
|
|
bit_array_clear(hdr->keep_samples, idx);
|
||
|
|
else
|
||
|
|
bit_array_set(hdr->keep_samples, idx);
|
||
|
|
}
|
||
|
|
for (i=0; i<n; i++) free(smpls[i]);
|
||
|
|
free(smpls);
|
||
|
|
|
||
|
|
bcf_hdr_nsamples(hdr) = 0;
|
||
|
|
for (i=0; i<hdr->nsamples_ori; i++)
|
||
|
|
if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
|
||
|
|
|
||
|
|
if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// Make new list and dictionary with desired samples
|
||
|
|
char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
|
||
|
|
vdict_t *new_dict, *d;
|
||
|
|
int k, res;
|
||
|
|
if (!samples) return -1;
|
||
|
|
|
||
|
|
new_dict = kh_init(vdict);
|
||
|
|
if (!new_dict) {
|
||
|
|
free(samples);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
idx = 0;
|
||
|
|
for (i=0; i<hdr->nsamples_ori; i++) {
|
||
|
|
if ( bit_array_test(hdr->keep_samples,i) ) {
|
||
|
|
samples[idx] = hdr->samples[i];
|
||
|
|
k = kh_put(vdict, new_dict, hdr->samples[i], &res);
|
||
|
|
if (res < 0) {
|
||
|
|
free(samples);
|
||
|
|
kh_destroy(vdict, new_dict);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
kh_val(new_dict, k) = bcf_idinfo_def;
|
||
|
|
kh_val(new_dict, k).id = idx;
|
||
|
|
idx++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Delete desired samples from old dictionary, so we don't free them
|
||
|
|
d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
|
||
|
|
for (i=0; i < idx; i++) {
|
||
|
|
int k = kh_get(vdict, d, samples[i]);
|
||
|
|
if (k < kh_end(d)) kh_del(vdict, d, k);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Free everything else
|
||
|
|
for (k = kh_begin(d); k != kh_end(d); ++k)
|
||
|
|
if (kh_exist(d, k)) free((char*)kh_key(d, k));
|
||
|
|
kh_destroy(vdict, d);
|
||
|
|
hdr->dict[BCF_DT_SAMPLE] = new_dict;
|
||
|
|
|
||
|
|
free(hdr->samples);
|
||
|
|
hdr->samples = samples;
|
||
|
|
|
||
|
|
if (bcf_hdr_sync(hdr) < 0)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
|
||
|
|
{
|
||
|
|
kstring_t ind;
|
||
|
|
ind.s = 0; ind.l = ind.m = 0;
|
||
|
|
if (n) {
|
||
|
|
bcf_fmt_t fmt[MAX_N_FMT];
|
||
|
|
int i, j;
|
||
|
|
uint8_t *ptr = (uint8_t*)v->indiv.s;
|
||
|
|
for (i = 0; i < v->n_fmt; ++i)
|
||
|
|
ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
|
||
|
|
for (i = 0; i < (int)v->n_fmt; ++i) {
|
||
|
|
bcf_fmt_t *f = &fmt[i];
|
||
|
|
bcf_enc_int1(&ind, f->id);
|
||
|
|
bcf_enc_size(&ind, f->n, f->type);
|
||
|
|
for (j = 0; j < n; ++j)
|
||
|
|
if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
|
||
|
|
}
|
||
|
|
for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
|
||
|
|
v->n_sample = i;
|
||
|
|
} else v->n_sample = 0;
|
||
|
|
if ( !v->n_sample ) v->n_fmt = 0;
|
||
|
|
free(v->indiv.s);
|
||
|
|
v->indiv = ind;
|
||
|
|
v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_is_snp(bcf1_t *v)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
bcf_unpack(v, BCF_UN_STR);
|
||
|
|
for (i = 0; i < v->n_allele; ++i)
|
||
|
|
{
|
||
|
|
if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
|
||
|
|
|
||
|
|
// mpileup's <X> allele, see also below. This is not completely satisfactory,
|
||
|
|
// a general library is here narrowly tailored to fit samtools.
|
||
|
|
if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
|
||
|
|
if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
|
||
|
|
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
return i == v->n_allele;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
|
||
|
|
{
|
||
|
|
if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; } // overlapping variant
|
||
|
|
|
||
|
|
// The most frequent case
|
||
|
|
if ( !ref[1] && !alt[1] )
|
||
|
|
{
|
||
|
|
if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
|
||
|
|
if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
|
||
|
|
var->n = 1; var->type = VCF_SNP; return;
|
||
|
|
}
|
||
|
|
if ( alt[0]=='<' )
|
||
|
|
{
|
||
|
|
if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
|
||
|
|
if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
|
||
|
|
if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
|
||
|
|
var->type = VCF_OTHER;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Catch "joined before" breakend case
|
||
|
|
if ( alt[0]==']' || alt[0] == '[' )
|
||
|
|
{
|
||
|
|
var->type = VCF_BND; return;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Iterate through alt characters that match the reference
|
||
|
|
const char *r = ref, *a = alt;
|
||
|
|
while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed
|
||
|
|
|
||
|
|
if ( *a && !*r )
|
||
|
|
{
|
||
|
|
if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
|
||
|
|
while ( *a ) a++;
|
||
|
|
var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
|
||
|
|
}
|
||
|
|
else if ( *r && !*a )
|
||
|
|
{
|
||
|
|
while ( *r ) r++;
|
||
|
|
var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
|
||
|
|
}
|
||
|
|
else if ( !*r && !*a )
|
||
|
|
{
|
||
|
|
var->n = 0; var->type = VCF_REF; return;
|
||
|
|
}
|
||
|
|
|
||
|
|
const char *re = r, *ae = a;
|
||
|
|
while ( re[1] ) re++;
|
||
|
|
while ( ae[1] ) ae++;
|
||
|
|
while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
|
||
|
|
if ( ae==a )
|
||
|
|
{
|
||
|
|
if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
|
||
|
|
var->n = -(re-r);
|
||
|
|
if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
|
||
|
|
var->type = VCF_OTHER; return;
|
||
|
|
}
|
||
|
|
else if ( re==r )
|
||
|
|
{
|
||
|
|
var->n = ae-a;
|
||
|
|
if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
|
||
|
|
var->type = VCF_OTHER; return;
|
||
|
|
}
|
||
|
|
|
||
|
|
var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
|
||
|
|
var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
|
||
|
|
|
||
|
|
// should do also complex events, SVs, etc...
|
||
|
|
}
|
||
|
|
|
||
|
|
static int bcf_set_variant_types(bcf1_t *b)
|
||
|
|
{
|
||
|
|
if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
|
||
|
|
bcf_dec_t *d = &b->d;
|
||
|
|
if ( d->n_var < b->n_allele )
|
||
|
|
{
|
||
|
|
bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
|
||
|
|
if (!new_var)
|
||
|
|
return -1;
|
||
|
|
d->var = new_var;
|
||
|
|
d->n_var = b->n_allele;
|
||
|
|
}
|
||
|
|
int i;
|
||
|
|
b->d.var_type = 0;
|
||
|
|
d->var[0].type = VCF_REF;
|
||
|
|
d->var[0].n = 0;
|
||
|
|
for (i=1; i<b->n_allele; i++)
|
||
|
|
{
|
||
|
|
bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
|
||
|
|
b->d.var_type |= d->var[i].type;
|
||
|
|
//fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
|
||
|
|
// to be compatible with callers that are not expecting newer values
|
||
|
|
// like VCF_INS, VCF_DEL. The full set is available from the newer
|
||
|
|
// vcf_has_variant_type* interfaces.
|
||
|
|
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
|
||
|
|
int bcf_get_variant_types(bcf1_t *rec)
|
||
|
|
{
|
||
|
|
if ( rec->d.var_type==-1 ) {
|
||
|
|
if (bcf_set_variant_types(rec) != 0) {
|
||
|
|
hts_log_error("Couldn't get variant types: %s", strerror(errno));
|
||
|
|
exit(1); // Due to legacy API having no way to report failures
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return rec->d.var_type & ORIG_VAR_TYPES;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
|
||
|
|
{
|
||
|
|
if ( rec->d.var_type==-1 ) {
|
||
|
|
if (bcf_set_variant_types(rec) != 0) {
|
||
|
|
hts_log_error("Couldn't get variant types: %s", strerror(errno));
|
||
|
|
exit(1); // Due to legacy API having no way to report failures
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (ith_allele < 0 || ith_allele >= rec->n_allele) {
|
||
|
|
hts_log_error("Requested allele outside valid range");
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
|
||
|
|
}
|
||
|
|
#undef ORIG_VAR_TYPES
|
||
|
|
|
||
|
|
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
|
||
|
|
{
|
||
|
|
if ( rec->d.var_type==-1 ) {
|
||
|
|
if (bcf_set_variant_types(rec) != 0) return -1;
|
||
|
|
}
|
||
|
|
if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
|
||
|
|
if (bitmask == VCF_REF) { // VCF_REF is 0, so handled as a special case
|
||
|
|
return rec->d.var[ith_allele].type == VCF_REF;
|
||
|
|
}
|
||
|
|
return bitmask & rec->d.var[ith_allele].type;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_variant_length(bcf1_t *rec, int ith_allele)
|
||
|
|
{
|
||
|
|
if ( rec->d.var_type==-1 ) {
|
||
|
|
if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
|
||
|
|
}
|
||
|
|
if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
|
||
|
|
return rec->d.var[ith_allele].n;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
|
||
|
|
enum bcf_variant_match mode)
|
||
|
|
{
|
||
|
|
if ( rec->d.var_type==-1 ) {
|
||
|
|
if (bcf_set_variant_types(rec) != 0) return -1;
|
||
|
|
}
|
||
|
|
uint32_t type = rec->d.var_type;
|
||
|
|
if ( mode==bcf_match_overlap ) return bitmask & type;
|
||
|
|
|
||
|
|
// VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
|
||
|
|
// ask for say `VCF_INS` or `VCF_INDEL` only
|
||
|
|
if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
|
||
|
|
else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
|
||
|
|
|
||
|
|
if ( mode==bcf_match_subset )
|
||
|
|
{
|
||
|
|
if ( ~bitmask & type ) return 0;
|
||
|
|
else return bitmask & type;
|
||
|
|
}
|
||
|
|
// mode == bcf_match_exact
|
||
|
|
return type==bitmask ? type : 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
|
||
|
|
{
|
||
|
|
static int negative_rlen_warned = 0;
|
||
|
|
int is_end_tag;
|
||
|
|
|
||
|
|
// Is the field already present?
|
||
|
|
int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
|
||
|
|
if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
|
||
|
|
|
||
|
|
is_end_tag = strcmp(key, "END") == 0;
|
||
|
|
|
||
|
|
for (i=0; i<line->n_info; i++)
|
||
|
|
if ( inf_id==line->d.info[i].key ) break;
|
||
|
|
bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
|
||
|
|
|
||
|
|
if ( !n || (type==BCF_HT_STR && !values) )
|
||
|
|
{
|
||
|
|
if ( n==0 && is_end_tag )
|
||
|
|
line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
|
||
|
|
if ( inf )
|
||
|
|
{
|
||
|
|
// Mark the tag for removal, free existing memory if necessary
|
||
|
|
if ( inf->vptr_free )
|
||
|
|
{
|
||
|
|
free(inf->vptr - inf->vptr_off);
|
||
|
|
inf->vptr_free = 0;
|
||
|
|
}
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_INF;
|
||
|
|
inf->vptr = NULL;
|
||
|
|
inf->vptr_off = inf->vptr_len = 0;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (is_end_tag)
|
||
|
|
{
|
||
|
|
if (n != 1)
|
||
|
|
{
|
||
|
|
hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
|
||
|
|
line->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if (type != BCF_HT_INT && type != BCF_HT_LONG)
|
||
|
|
{
|
||
|
|
hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
|
||
|
|
line->errcode |= BCF_ERR_TAG_INVALID;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Encode the values and determine the size required to accommodate the values
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
bcf_enc_int1(&str, inf_id);
|
||
|
|
if ( type==BCF_HT_INT )
|
||
|
|
bcf_enc_vint(&str, n, (int32_t*)values, -1);
|
||
|
|
else if ( type==BCF_HT_REAL )
|
||
|
|
bcf_enc_vfloat(&str, n, (float*)values);
|
||
|
|
else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
|
||
|
|
{
|
||
|
|
if ( values==NULL )
|
||
|
|
bcf_enc_size(&str, 0, BCF_BT_NULL);
|
||
|
|
else
|
||
|
|
bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
|
||
|
|
}
|
||
|
|
#ifdef VCF_ALLOW_INT64
|
||
|
|
else if ( type==BCF_HT_LONG )
|
||
|
|
{
|
||
|
|
if (n != 1) {
|
||
|
|
hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
|
||
|
|
abort();
|
||
|
|
}
|
||
|
|
bcf_enc_long1(&str, *(int64_t *) values);
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
else
|
||
|
|
{
|
||
|
|
hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
|
||
|
|
abort();
|
||
|
|
}
|
||
|
|
|
||
|
|
// Is the INFO tag already present
|
||
|
|
if ( inf )
|
||
|
|
{
|
||
|
|
// Is it big enough to accommodate new block?
|
||
|
|
if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
|
||
|
|
{
|
||
|
|
if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
|
||
|
|
uint8_t *ptr = inf->vptr - inf->vptr_off;
|
||
|
|
memcpy(ptr, str.s, str.l);
|
||
|
|
free(str.s);
|
||
|
|
int vptr_free = inf->vptr_free;
|
||
|
|
bcf_unpack_info_core1(ptr, inf);
|
||
|
|
inf->vptr_free = vptr_free;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
if ( inf->vptr_free )
|
||
|
|
free(inf->vptr - inf->vptr_off);
|
||
|
|
bcf_unpack_info_core1((uint8_t*)str.s, inf);
|
||
|
|
inf->vptr_free = 1;
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_INF;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// The tag is not present, create new one
|
||
|
|
line->n_info++;
|
||
|
|
hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
|
||
|
|
inf = &line->d.info[line->n_info-1];
|
||
|
|
bcf_unpack_info_core1((uint8_t*)str.s, inf);
|
||
|
|
inf->vptr_free = 1;
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_INF;
|
||
|
|
}
|
||
|
|
line->unpacked |= BCF_UN_INFO;
|
||
|
|
|
||
|
|
if ( n==1 && is_end_tag) {
|
||
|
|
hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
|
||
|
|
if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
|
||
|
|
{
|
||
|
|
if ( end <= line->pos )
|
||
|
|
{
|
||
|
|
if ( !negative_rlen_warned )
|
||
|
|
{
|
||
|
|
hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
|
||
|
|
negative_rlen_warned = 1;
|
||
|
|
}
|
||
|
|
line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
line->rlen = end - line->pos;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
|
||
|
|
{
|
||
|
|
if ( !n )
|
||
|
|
return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
|
||
|
|
|
||
|
|
int i, max_len = 0;
|
||
|
|
for (i=0; i<n; i++)
|
||
|
|
{
|
||
|
|
int len = strlen(values[i]);
|
||
|
|
if ( len > max_len ) max_len = len;
|
||
|
|
}
|
||
|
|
char *out = (char*) malloc(max_len*n);
|
||
|
|
if ( !out ) return -2;
|
||
|
|
for (i=0; i<n; i++)
|
||
|
|
{
|
||
|
|
char *dst = out+i*max_len;
|
||
|
|
const char *src = values[i];
|
||
|
|
int j = 0;
|
||
|
|
while ( src[j] ) { dst[j] = src[j]; j++; }
|
||
|
|
for (; j<max_len; j++) dst[j] = 0;
|
||
|
|
}
|
||
|
|
int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
|
||
|
|
free(out);
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
|
||
|
|
{
|
||
|
|
// Is the field already present?
|
||
|
|
int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
|
||
|
|
{
|
||
|
|
if ( !n ) return 0;
|
||
|
|
return -1; // the key not present in the header
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
||
|
|
|
||
|
|
for (i=0; i<line->n_fmt; i++)
|
||
|
|
if ( line->d.fmt[i].id==fmt_id ) break;
|
||
|
|
bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
|
||
|
|
|
||
|
|
if ( !n )
|
||
|
|
{
|
||
|
|
if ( fmt )
|
||
|
|
{
|
||
|
|
// Mark the tag for removal, free existing memory if necessary
|
||
|
|
if ( fmt->p_free )
|
||
|
|
{
|
||
|
|
free(fmt->p - fmt->p_off);
|
||
|
|
fmt->p_free = 0;
|
||
|
|
}
|
||
|
|
line->d.indiv_dirty = 1;
|
||
|
|
fmt->p = NULL;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
line->n_sample = bcf_hdr_nsamples(hdr);
|
||
|
|
int nps = n / line->n_sample; // number of values per sample
|
||
|
|
assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
|
||
|
|
|
||
|
|
// Encode the values and determine the size required to accommodate the values
|
||
|
|
kstring_t str = {0,0,0};
|
||
|
|
bcf_enc_int1(&str, fmt_id);
|
||
|
|
assert(values != NULL);
|
||
|
|
if ( type==BCF_HT_INT )
|
||
|
|
bcf_enc_vint(&str, n, (int32_t*)values, nps);
|
||
|
|
else if ( type==BCF_HT_REAL )
|
||
|
|
{
|
||
|
|
bcf_enc_size(&str, nps, BCF_BT_FLOAT);
|
||
|
|
serialize_float_array(&str, nps*line->n_sample, (float *) values);
|
||
|
|
}
|
||
|
|
else if ( type==BCF_HT_STR )
|
||
|
|
{
|
||
|
|
bcf_enc_size(&str, nps, BCF_BT_CHAR);
|
||
|
|
kputsn((char*)values, nps*line->n_sample, &str);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
|
||
|
|
abort();
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( !fmt )
|
||
|
|
{
|
||
|
|
// Not present, new format field
|
||
|
|
line->n_fmt++;
|
||
|
|
hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
|
||
|
|
|
||
|
|
// Special case: VCF specification requires that GT is always first
|
||
|
|
if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
|
||
|
|
{
|
||
|
|
for (i=line->n_fmt-1; i>0; i--)
|
||
|
|
line->d.fmt[i] = line->d.fmt[i-1];
|
||
|
|
fmt = &line->d.fmt[0];
|
||
|
|
}
|
||
|
|
else
|
||
|
|
fmt = &line->d.fmt[line->n_fmt-1];
|
||
|
|
bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
|
||
|
|
line->d.indiv_dirty = 1;
|
||
|
|
fmt->p_free = 1;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// The tag is already present, check if it is big enough to accommodate the new block
|
||
|
|
if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
|
||
|
|
{
|
||
|
|
// good, the block is big enough
|
||
|
|
if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
|
||
|
|
uint8_t *ptr = fmt->p - fmt->p_off;
|
||
|
|
memcpy(ptr, str.s, str.l);
|
||
|
|
free(str.s);
|
||
|
|
int p_free = fmt->p_free;
|
||
|
|
bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
|
||
|
|
fmt->p_free = p_free;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
if ( fmt->p_free )
|
||
|
|
free(fmt->p - fmt->p_off);
|
||
|
|
bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
|
||
|
|
fmt->p_free = 1;
|
||
|
|
line->d.indiv_dirty = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
line->unpacked |= BCF_UN_FMT;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
|
||
|
|
{
|
||
|
|
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
||
|
|
line->d.n_flt = n;
|
||
|
|
if ( !n ) return 0;
|
||
|
|
hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
|
||
|
|
int i;
|
||
|
|
for (i=0; i<n; i++)
|
||
|
|
line->d.flt[i] = flt_ids[i];
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
|
||
|
|
{
|
||
|
|
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
||
|
|
int i;
|
||
|
|
for (i=0; i<line->d.n_flt; i++)
|
||
|
|
if ( flt_id==line->d.flt[i] ) break;
|
||
|
|
if ( i<line->d.n_flt ) return 0; // this filter is already set
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
||
|
|
if ( flt_id==0 ) // set to PASS
|
||
|
|
line->d.n_flt = 1;
|
||
|
|
else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
|
||
|
|
line->d.n_flt = 1;
|
||
|
|
else
|
||
|
|
line->d.n_flt++;
|
||
|
|
hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
|
||
|
|
line->d.flt[line->d.n_flt-1] = flt_id;
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
|
||
|
|
{
|
||
|
|
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
||
|
|
int i;
|
||
|
|
for (i=0; i<line->d.n_flt; i++)
|
||
|
|
if ( flt_id==line->d.flt[i] ) break;
|
||
|
|
if ( i==line->d.n_flt ) return 0; // the filter is not present
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
||
|
|
if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
|
||
|
|
line->d.n_flt--;
|
||
|
|
if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
|
||
|
|
{
|
||
|
|
if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
|
||
|
|
int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
|
||
|
|
|
||
|
|
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
||
|
|
if ( id==0 && !line->d.n_flt) return 1; // PASS
|
||
|
|
|
||
|
|
int i;
|
||
|
|
for (i=0; i<line->d.n_flt; i++)
|
||
|
|
if ( line->d.flt[i]==id ) return 1;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
|
||
|
|
{
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_ALS;
|
||
|
|
|
||
|
|
line->n_allele = nals;
|
||
|
|
hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
|
||
|
|
|
||
|
|
char *als = line->d.als;
|
||
|
|
int n = 0;
|
||
|
|
while (n<nals)
|
||
|
|
{
|
||
|
|
line->d.allele[n] = als;
|
||
|
|
while ( *als ) als++;
|
||
|
|
als++;
|
||
|
|
n++;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Update REF length. Note that END is 1-based while line->pos 0-based
|
||
|
|
bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
|
||
|
|
if ( end_info )
|
||
|
|
{
|
||
|
|
if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL;
|
||
|
|
else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL;
|
||
|
|
}
|
||
|
|
if ( end_info && end_info->v1.i > line->pos )
|
||
|
|
line->rlen = end_info->v1.i - line->pos;
|
||
|
|
else if ( nals > 0 )
|
||
|
|
line->rlen = strlen(line->d.allele[0]);
|
||
|
|
else
|
||
|
|
line->rlen = 0;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
|
||
|
|
{
|
||
|
|
if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
|
||
|
|
char *free_old = NULL;
|
||
|
|
char buffer[256];
|
||
|
|
size_t used = 0;
|
||
|
|
|
||
|
|
// The pointers in alleles may point into the existing line->d.als memory,
|
||
|
|
// so care needs to be taken not to clobber them while updating. Usually
|
||
|
|
// they will be short so we can copy through an intermediate buffer.
|
||
|
|
// If they're longer, or won't fit in the existing allocation we
|
||
|
|
// can allocate a new buffer to write into. Note that in either case
|
||
|
|
// pointers to line->d.als memory in alleles may not be valid when we've
|
||
|
|
// finished.
|
||
|
|
int i;
|
||
|
|
size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
|
||
|
|
for (i=0; i<nals; i++) {
|
||
|
|
size_t sz = strlen(alleles[i]) + 1;
|
||
|
|
if (avail - used < sz)
|
||
|
|
break;
|
||
|
|
memcpy(buffer + used, alleles[i], sz);
|
||
|
|
used += sz;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Did we miss anything?
|
||
|
|
if (i < nals) {
|
||
|
|
int j;
|
||
|
|
size_t needed = used;
|
||
|
|
char *new_als;
|
||
|
|
for (j = i; j < nals; j++)
|
||
|
|
needed += strlen(alleles[j]) + 1;
|
||
|
|
if (needed < line->d.m_als) // Don't shrink the buffer
|
||
|
|
needed = line->d.m_als;
|
||
|
|
if (needed > INT_MAX) {
|
||
|
|
hts_log_error("REF + alleles too long to fit in a BCF record");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
new_als = malloc(needed);
|
||
|
|
if (!new_als)
|
||
|
|
return -1;
|
||
|
|
free_old = line->d.als;
|
||
|
|
line->d.als = new_als;
|
||
|
|
line->d.m_als = needed;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Copy from the temp buffer to the destination
|
||
|
|
if (used) {
|
||
|
|
assert(used <= line->d.m_als);
|
||
|
|
memcpy(line->d.als, buffer, used);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Add in any remaining entries - if this happens we will always be
|
||
|
|
// writing to a newly-allocated buffer.
|
||
|
|
for (; i < nals; i++) {
|
||
|
|
size_t sz = strlen(alleles[i]) + 1;
|
||
|
|
memcpy(line->d.als + used, alleles[i], sz);
|
||
|
|
used += sz;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (free_old)
|
||
|
|
free(free_old);
|
||
|
|
return _bcf1_sync_alleles(hdr,line,nals);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
|
||
|
|
{
|
||
|
|
if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
|
||
|
|
kstring_t tmp;
|
||
|
|
tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
|
||
|
|
kputs(alleles_string, &tmp);
|
||
|
|
line->d.als = tmp.s; line->d.m_als = tmp.m;
|
||
|
|
|
||
|
|
int nals = 1;
|
||
|
|
char *t = line->d.als;
|
||
|
|
while (*t)
|
||
|
|
{
|
||
|
|
if ( *t==',' ) { *t = 0; nals++; }
|
||
|
|
t++;
|
||
|
|
}
|
||
|
|
return _bcf1_sync_alleles(hdr, line, nals);
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
|
||
|
|
{
|
||
|
|
if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
|
||
|
|
kstring_t tmp;
|
||
|
|
tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
|
||
|
|
if ( id )
|
||
|
|
kputs(id, &tmp);
|
||
|
|
else
|
||
|
|
kputs(".", &tmp);
|
||
|
|
line->d.id = tmp.s; line->d.m_id = tmp.m;
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_ID;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
|
||
|
|
{
|
||
|
|
if ( !id ) return 0;
|
||
|
|
if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
|
||
|
|
|
||
|
|
kstring_t tmp;
|
||
|
|
tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
|
||
|
|
|
||
|
|
int len = strlen(id);
|
||
|
|
char *dst = line->d.id;
|
||
|
|
while ( *dst && (dst=strstr(dst,id)) )
|
||
|
|
{
|
||
|
|
if ( dst[len]!=0 && dst[len]!=';' ) dst++; // a prefix, not a match
|
||
|
|
else if ( dst==line->d.id || dst[-1]==';' ) return 0; // already present
|
||
|
|
dst++; // a suffix, not a match
|
||
|
|
}
|
||
|
|
if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
|
||
|
|
{
|
||
|
|
tmp.l = strlen(line->d.id);
|
||
|
|
kputc(';',&tmp);
|
||
|
|
}
|
||
|
|
kputs(id,&tmp);
|
||
|
|
|
||
|
|
line->d.id = tmp.s; line->d.m_id = tmp.m;
|
||
|
|
line->d.shared_dirty |= BCF1_DIRTY_ID;
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
|
||
|
|
{
|
||
|
|
int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
|
||
|
|
return bcf_get_fmt_id(line, id);
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
|
||
|
|
{
|
||
|
|
int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
|
||
|
|
return bcf_get_info_id(line, id);
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
||
|
|
for (i=0; i<line->n_fmt; i++)
|
||
|
|
{
|
||
|
|
if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
|
||
|
|
}
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
|
||
|
|
for (i=0; i<line->n_info; i++)
|
||
|
|
{
|
||
|
|
if ( line->d.info[i].key==id ) return &line->d.info[i];
|
||
|
|
}
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
|
||
|
|
{
|
||
|
|
int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
|
||
|
|
if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type
|
||
|
|
|
||
|
|
if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
|
||
|
|
|
||
|
|
for (i=0; i<line->n_info; i++)
|
||
|
|
if ( line->d.info[i].key==tag_id ) break;
|
||
|
|
if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
|
||
|
|
if ( type==BCF_HT_FLAG ) return 1;
|
||
|
|
|
||
|
|
bcf_info_t *info = &line->d.info[i];
|
||
|
|
if ( !info->vptr ) return -3; // the tag was marked for removal
|
||
|
|
if ( type==BCF_HT_STR )
|
||
|
|
{
|
||
|
|
if ( *ndst < info->len+1 )
|
||
|
|
{
|
||
|
|
*ndst = info->len + 1;
|
||
|
|
*dst = realloc(*dst, *ndst);
|
||
|
|
}
|
||
|
|
memcpy(*dst,info->vptr,info->len);
|
||
|
|
((uint8_t*)*dst)[info->len] = 0;
|
||
|
|
return info->len;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Make sure the buffer is big enough
|
||
|
|
int size1;
|
||
|
|
switch (type) {
|
||
|
|
case BCF_HT_INT: size1 = sizeof(int32_t); break;
|
||
|
|
case BCF_HT_LONG: size1 = sizeof(int64_t); break;
|
||
|
|
case BCF_HT_REAL: size1 = sizeof(float); break;
|
||
|
|
default:
|
||
|
|
hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
|
||
|
|
return -2;
|
||
|
|
}
|
||
|
|
if ( *ndst < info->len )
|
||
|
|
{
|
||
|
|
*ndst = info->len;
|
||
|
|
*dst = realloc(*dst, *ndst * size1);
|
||
|
|
}
|
||
|
|
|
||
|
|
#define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
|
||
|
|
out_type_t *tmp = (out_type_t *) *dst; \
|
||
|
|
int j; \
|
||
|
|
for (j=0; j<info->len; j++) \
|
||
|
|
{ \
|
||
|
|
type_t p = convert(info->vptr + j * sizeof(type_t)); \
|
||
|
|
if ( is_vector_end ) break; \
|
||
|
|
if ( is_missing ) set_missing; \
|
||
|
|
else set_regular; \
|
||
|
|
tmp++; \
|
||
|
|
} \
|
||
|
|
ret = j; \
|
||
|
|
} while (0)
|
||
|
|
switch (info->type) {
|
||
|
|
case BCF_BT_INT8:
|
||
|
|
if (type == BCF_HT_LONG) {
|
||
|
|
BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
|
||
|
|
} else {
|
||
|
|
BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
case BCF_BT_INT16:
|
||
|
|
if (type == BCF_HT_LONG) {
|
||
|
|
BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
|
||
|
|
} else {
|
||
|
|
BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
case BCF_BT_INT32:
|
||
|
|
if (type == BCF_HT_LONG) {
|
||
|
|
BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
|
||
|
|
} else {
|
||
|
|
BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
|
||
|
|
}
|
||
|
|
case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
|
||
|
|
default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
|
||
|
|
}
|
||
|
|
#undef BRANCH
|
||
|
|
return ret; // set by BRANCH
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
|
||
|
|
{
|
||
|
|
int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
|
||
|
|
if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
|
||
|
|
|
||
|
|
if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
||
|
|
|
||
|
|
for (i=0; i<line->n_fmt; i++)
|
||
|
|
if ( line->d.fmt[i].id==tag_id ) break;
|
||
|
|
if ( i==line->n_fmt ) return -3; // the tag is not present in this record
|
||
|
|
bcf_fmt_t *fmt = &line->d.fmt[i];
|
||
|
|
if ( !fmt->p ) return -3; // the tag was marked for removal
|
||
|
|
|
||
|
|
int nsmpl = bcf_hdr_nsamples(hdr);
|
||
|
|
if ( !*dst )
|
||
|
|
{
|
||
|
|
*dst = (char**) malloc(sizeof(char*)*nsmpl);
|
||
|
|
if ( !*dst ) return -4; // could not alloc
|
||
|
|
(*dst)[0] = NULL;
|
||
|
|
}
|
||
|
|
int n = (fmt->n+1)*nsmpl;
|
||
|
|
if ( *ndst < n )
|
||
|
|
{
|
||
|
|
(*dst)[0] = realloc((*dst)[0], n);
|
||
|
|
if ( !(*dst)[0] ) return -4; // could not alloc
|
||
|
|
*ndst = n;
|
||
|
|
}
|
||
|
|
for (i=0; i<nsmpl; i++)
|
||
|
|
{
|
||
|
|
uint8_t *src = fmt->p + i*fmt->n;
|
||
|
|
uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
|
||
|
|
memcpy(tmp,src,fmt->n);
|
||
|
|
tmp[fmt->n] = 0;
|
||
|
|
(*dst)[i] = (char*) tmp;
|
||
|
|
}
|
||
|
|
return n;
|
||
|
|
}
|
||
|
|
|
||
|
|
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
|
||
|
|
{
|
||
|
|
int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
|
||
|
|
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
|
||
|
|
if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
|
||
|
|
{
|
||
|
|
// Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
|
||
|
|
if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
|
||
|
|
}
|
||
|
|
else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
|
||
|
|
|
||
|
|
if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
||
|
|
|
||
|
|
for (i=0; i<line->n_fmt; i++)
|
||
|
|
if ( line->d.fmt[i].id==tag_id ) break;
|
||
|
|
if ( i==line->n_fmt ) return -3; // the tag is not present in this record
|
||
|
|
bcf_fmt_t *fmt = &line->d.fmt[i];
|
||
|
|
if ( !fmt->p ) return -3; // the tag was marked for removal
|
||
|
|
|
||
|
|
if ( type==BCF_HT_STR )
|
||
|
|
{
|
||
|
|
int n = fmt->n*bcf_hdr_nsamples(hdr);
|
||
|
|
if ( *ndst < n )
|
||
|
|
{
|
||
|
|
*dst = realloc(*dst, n);
|
||
|
|
if ( !*dst ) return -4; // could not alloc
|
||
|
|
*ndst = n;
|
||
|
|
}
|
||
|
|
memcpy(*dst,fmt->p,n);
|
||
|
|
return n;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Make sure the buffer is big enough
|
||
|
|
int nsmpl = bcf_hdr_nsamples(hdr);
|
||
|
|
int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
|
||
|
|
if ( *ndst < fmt->n*nsmpl )
|
||
|
|
{
|
||
|
|
*ndst = fmt->n*nsmpl;
|
||
|
|
*dst = realloc(*dst, *ndst*size1);
|
||
|
|
if ( !*dst ) return -4; // could not alloc
|
||
|
|
}
|
||
|
|
|
||
|
|
#define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
|
||
|
|
out_type_t *tmp = (out_type_t *) *dst; \
|
||
|
|
uint8_t *fmt_p = fmt->p; \
|
||
|
|
for (i=0; i<nsmpl; i++) \
|
||
|
|
{ \
|
||
|
|
for (j=0; j<fmt->n; j++) \
|
||
|
|
{ \
|
||
|
|
type_t p = convert(fmt_p + j * sizeof(type_t)); \
|
||
|
|
if ( is_missing ) set_missing; \
|
||
|
|
else if ( is_vector_end ) { set_vector_end; break; } \
|
||
|
|
else set_regular; \
|
||
|
|
tmp++; \
|
||
|
|
} \
|
||
|
|
for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
|
||
|
|
fmt_p += fmt->size; \
|
||
|
|
} \
|
||
|
|
}
|
||
|
|
switch (fmt->type) {
|
||
|
|
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
|
||
|
|
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
|
||
|
|
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
|
||
|
|
case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
|
||
|
|
default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
|
||
|
|
}
|
||
|
|
#undef BRANCH
|
||
|
|
return nsmpl*fmt->n;
|
||
|
|
}
|
||
|
|
|
||
|
|
//error description structure definition
|
||
|
|
typedef struct err_desc {
|
||
|
|
int errorcode;
|
||
|
|
const char *description;
|
||
|
|
}err_desc;
|
||
|
|
|
||
|
|
// error descriptions
|
||
|
|
static const err_desc errdesc_bcf[] = {
|
||
|
|
{ BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
|
||
|
|
{ BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
|
||
|
|
{ BCF_ERR_NCOLS, "Incorrect number of columns" },
|
||
|
|
{ BCF_ERR_LIMITS, "Limits reached" },
|
||
|
|
{ BCF_ERR_CHAR, "Invalid character" },
|
||
|
|
{ BCF_ERR_CTG_INVALID, "Invalid contig" },
|
||
|
|
{ BCF_ERR_TAG_INVALID, "Invalid tag" },
|
||
|
|
};
|
||
|
|
|
||
|
|
/// append given description to buffer based on available size and add ... when not enough space
|
||
|
|
/** @param buffer buffer to which description to be appended
|
||
|
|
@param offset offset at which to be appended
|
||
|
|
@param maxbuffer maximum size of the buffer
|
||
|
|
@param description the description to be appended
|
||
|
|
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
|
||
|
|
on success returns 0
|
||
|
|
*/
|
||
|
|
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
|
||
|
|
|
||
|
|
if (!description || !buffer || !offset || (maxbuffer < 4))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
size_t rembuffer = maxbuffer - *offset;
|
||
|
|
if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) { //add description with optionally required ','
|
||
|
|
*offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
|
||
|
|
} else { //not enough space for description, put ...
|
||
|
|
size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
|
||
|
|
snprintf(buffer + tmppos, 4, "..."); //ignore offset update
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
//get description for given error code. return NULL on error
|
||
|
|
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
|
||
|
|
size_t usedup = 0;
|
||
|
|
int ret = 0;
|
||
|
|
int idx;
|
||
|
|
|
||
|
|
if (!buffer || maxbuffer < 4)
|
||
|
|
return NULL; //invalid / insufficient buffer
|
||
|
|
|
||
|
|
if (!errorcode) {
|
||
|
|
buffer[0] = '\0'; //no error, set null
|
||
|
|
return buffer;
|
||
|
|
}
|
||
|
|
|
||
|
|
for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
|
||
|
|
if (errorcode & errdesc_bcf[idx].errorcode) { //error is set, add description
|
||
|
|
ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
|
||
|
|
if (ret < 0)
|
||
|
|
break; //not enough space, ... added, no need to continue
|
||
|
|
|
||
|
|
errorcode &= ~errdesc_bcf[idx].errorcode; //reset the error
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (errorcode && (ret >= 0)) { //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
|
||
|
|
add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
|
||
|
|
}
|
||
|
|
return buffer;
|
||
|
|
}
|
||
|
|
|