1041 lines
29 KiB
C
1041 lines
29 KiB
C
|
|
/*
|
||
|
|
Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd.
|
||
|
|
Author: James Bonfield <jkb@sanger.ac.uk>
|
||
|
|
|
||
|
|
Redistribution and use in source and binary forms, with or without
|
||
|
|
modification, are permitted provided that the following conditions are met:
|
||
|
|
|
||
|
|
1. Redistributions of source code must retain the above copyright notice,
|
||
|
|
this list of conditions and the following disclaimer.
|
||
|
|
|
||
|
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
||
|
|
this list of conditions and the following disclaimer in the documentation
|
||
|
|
and/or other materials provided with the distribution.
|
||
|
|
|
||
|
|
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
||
|
|
Institute nor the names of its contributors may be used to endorse or promote
|
||
|
|
products derived from this software without specific prior written permission.
|
||
|
|
|
||
|
|
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
|
||
|
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
|
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
|
||
|
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
* The index is a gzipped tab-delimited text file with one line per slice.
|
||
|
|
* The columns are:
|
||
|
|
 *  1: reference number (0 to N-1, as per BAM ref_id)
 *  2: reference position of 1st read in slice (1..?)
 *  3: alignment span of the slice on the reference (so the slice covers
 *     start .. start+span-1)
 *  4: offset of container start (normally an absolute file offset;
 *     cram_seek_to_refpos falls back to treating it as relative to the
 *     first container if an absolute seek fails).
 *  5: offset of the slice within its container (ie which landmark).
 *  6: size of the slice in bytes.
|
||
|
|
*
|
||
|
|
* In memory, we hold this in a nested containment list. Each list element is
|
||
|
|
* a cram_index struct. Each element in turn can contain its own list of
|
||
|
|
* cram_index structs.
|
||
|
|
*
|
||
|
|
* Any start..end range which is entirely contained within another (and
|
||
|
|
* earlier as it is sorted) range will be held within it. This ensures that
|
||
|
|
* the outer list will never have containments and we can safely do a
|
||
|
|
* binary search to find the first range which overlaps any given coordinate.
|
||
|
|
*/
|
||
|
|
|
||
|
|
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
|
||
|
|
#include <config.h>
|
||
|
|
|
||
|
|
#include <stdio.h>
|
||
|
|
#include <errno.h>
|
||
|
|
#include <assert.h>
|
||
|
|
#include <inttypes.h>
|
||
|
|
#include <stdlib.h>
|
||
|
|
#include <string.h>
|
||
|
|
#include <zlib.h>
|
||
|
|
#include <sys/types.h>
|
||
|
|
#include <sys/stat.h>
|
||
|
|
#include <math.h>
|
||
|
|
|
||
|
|
#include "../htslib/bgzf.h"
|
||
|
|
#include "../htslib/hfile.h"
|
||
|
|
#include "../hts_internal.h"
|
||
|
|
#include "cram.h"
|
||
|
|
#include "os.h"
|
||
|
|
|
||
|
|
#if 0
// Debugging aid (compiled out).  Prints one index node followed,
// recursively, by everything nested inside it; each nesting level is
// indented by a further four columns.
static void dump_index_(cram_index *e, int level) {
    int child;
    int width = printf("%*s%d / %d .. %d, ", level*4, "",
                       e->refid, e->start, e->end);

    // Pad so that the offset column lines up across nesting levels.
    printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-width), "",
           e->offset, e, e->e_next);

    for (child = 0; child < e->nslice; child++)
        dump_index_(&e->e[child], level+1);
}

// Debugging aid (compiled out).  Dumps every top-level slot of fd->index.
static void dump_index(cram_fd *fd) {
    int slot;

    for (slot = 0; slot < fd->index_sz; slot++)
        dump_index_(&fd->index[slot], 0);
}
#endif
|
||
|
|
|
||
|
|
// Thread a linked list through the nested containment list.
|
||
|
|
// This makes navigating it and finding the "next" index entry
|
||
|
|
// trivial.
|
||
|
|
static cram_index *link_index_(cram_index *e, cram_index *e_last) {
|
||
|
|
int i;
|
||
|
|
if (e_last)
|
||
|
|
e_last->e_next = e;
|
||
|
|
|
||
|
|
// We don't want to link in the top-level cram_index with
|
||
|
|
// offset=0 and start/end = INT_MIN/INT_MAX.
|
||
|
|
if (e->offset)
|
||
|
|
e_last = e;
|
||
|
|
|
||
|
|
for (i = 0; i < e->nslice; i++)
|
||
|
|
e_last = link_index_(&e->e[i], e_last);
|
||
|
|
|
||
|
|
return e_last;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Builds the flat e_next chain across every slot of fd->index, in slot
// order, and NULL-terminates it.
static void link_index(cram_fd *fd) {
    cram_index *tail = NULL;
    int slot;

    for (slot = 0; slot < fd->index_sz; slot++)
        tail = link_index_(&fd->index[slot], tail);

    if (tail != NULL)
        tail->e_next = NULL;
}
|
||
|
|
|
||
|
|
/*
 * Parses a decimal 32-bit integer from kstring k, starting at offset *pos.
 *
 * Leading spaces and tabs are skipped and a single leading '-' is accepted.
 * On success *pos is advanced to the first character after the digits and
 * the parsed value is stored in *val_p.
 *
 * Returns 0 on success
 *        -1 if no digits are found, or if the value does not fit in an
 *           int32_t (*pos and *val_p are left untouched)
 */
static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
    int negative = 0;
    int64_t val = 0;
    // Largest magnitude allowed; negatives get one more (for INT32_MIN).
    int64_t limit = INT32_MAX;
    size_t p = *pos;

    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
        p++;

    if (p < k->l && k->s[p] == '-') {
        negative = 1;
        limit = (int64_t)INT32_MAX + 1;
        p++;
    }

    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
        return -1;

    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
        int digit = k->s[p++]-'0';

        // The index is external input.  Accumulate in 64 bits and reject
        // out-of-range numbers instead of overflowing a signed 32-bit
        // integer, which is undefined behaviour.
        val = val*10 + digit;
        if (val > limit)
            return -1;
    }

    *pos = p;
    *val_p = (int32_t)(negative ? -val : val);

    return 0;
}
|
||
|
|
|
||
|
|
/*
 * Parses a decimal 64-bit integer from kstring k, starting at offset *pos.
 *
 * Leading spaces and tabs are skipped and a single leading '-' is accepted.
 * On success *pos is advanced to the first character after the digits and
 * the parsed value is stored in *val_p.
 *
 * Returns 0 on success
 *        -1 if no digits are found, or if the value does not fit in an
 *           int64_t (*pos and *val_p are left untouched)
 */
static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
    int negative = 0;
    uint64_t val = 0;
    // Largest magnitude allowed; negatives get one more (for INT64_MIN).
    uint64_t limit = INT64_MAX;
    size_t p = *pos;

    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
        p++;

    if (p < k->l && k->s[p] == '-') {
        negative = 1;
        limit = (uint64_t)INT64_MAX + 1;
        p++;
    }

    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
        return -1;

    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
        unsigned int digit = (unsigned int)(k->s[p++]-'0');

        // The index is external input.  Check before multiplying so an
        // over-long number is rejected rather than overflowing.
        if (val > (limit - digit) / 10)
            return -1;
        val = val*10 + digit;
    }

    *pos = p;
    if (!negative)
        *val_p = (int64_t)val;
    else if (val > (uint64_t)INT64_MAX)
        *val_p = INT64_MIN;     // magnitude 2^63 cannot be negated as int64_t
    else
        *val_p = -(int64_t)val;

    return 0;
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Loads a CRAM .crai index into memory.
|
||
|
|
*
|
||
|
|
* Returns 0 for success
|
||
|
|
* -1 for failure
|
||
|
|
*/
|
||
|
|
int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
|
||
|
|
|
||
|
|
char *tfn_idx = NULL;
|
||
|
|
char buf[65536];
|
||
|
|
ssize_t len;
|
||
|
|
kstring_t kstr = {0};
|
||
|
|
hFILE *fp;
|
||
|
|
cram_index *idx;
|
||
|
|
cram_index **idx_stack = NULL, *ep, e;
|
||
|
|
int idx_stack_alloc = 0, idx_stack_ptr = 0;
|
||
|
|
size_t pos = 0;
|
||
|
|
|
||
|
|
/* Check if already loaded */
|
||
|
|
if (fd->index)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
|
||
|
|
if (!fd->index)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
idx = &fd->index[0];
|
||
|
|
idx->refid = -1;
|
||
|
|
idx->start = INT_MIN;
|
||
|
|
idx->end = INT_MAX;
|
||
|
|
|
||
|
|
idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
|
||
|
|
if (!idx_stack)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
idx_stack[idx_stack_ptr] = idx;
|
||
|
|
|
||
|
|
// Support pathX.cram##idx##pathY.crai
|
||
|
|
const char *fn_delim = strstr(fn, HTS_IDX_DELIM);
|
||
|
|
if (fn_delim && !fn_idx)
|
||
|
|
fn_idx = fn_delim + strlen(HTS_IDX_DELIM);
|
||
|
|
|
||
|
|
if (!fn_idx) {
|
||
|
|
if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn))
|
||
|
|
tfn_idx = hts_idx_getfn(fn, ".crai");
|
||
|
|
|
||
|
|
if (!tfn_idx) {
|
||
|
|
hts_log_error("Could not retrieve index file for '%s'", fn);
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
fn_idx = tfn_idx;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(fp = hopen(fn_idx, "r"))) {
|
||
|
|
hts_log_error("Could not open index file '%s'", fn_idx);
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Load the file into memory
|
||
|
|
while ((len = hread(fp, buf, sizeof(buf))) > 0) {
|
||
|
|
if (kputsn(buf, len, &kstr) < 0)
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (len < 0 || kstr.l < 2)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (hclose(fp) < 0)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
// Uncompress if required
|
||
|
|
if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
|
||
|
|
size_t l = 0;
|
||
|
|
char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
|
||
|
|
if (!s)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
free(kstr.s);
|
||
|
|
kstr.s = s;
|
||
|
|
kstr.l = l;
|
||
|
|
kstr.m = l; // conservative estimate of the size allocated
|
||
|
|
if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
// Parse it line at a time
|
||
|
|
while (pos < kstr.l) {
|
||
|
|
/* 1.1 layout */
|
||
|
|
if (kget_int32(&kstr, &pos, &e.refid) == -1)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (kget_int32(&kstr, &pos, &e.start) == -1)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (kget_int32(&kstr, &pos, &e.end) == -1)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (kget_int64(&kstr, &pos, &e.offset) == -1)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (kget_int32(&kstr, &pos, &e.slice) == -1)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (kget_int32(&kstr, &pos, &e.len) == -1)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
e.end += e.start-1;
|
||
|
|
//printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice);
|
||
|
|
|
||
|
|
if (e.refid < -1) {
|
||
|
|
hts_log_error("Malformed index file, refid %d", e.refid);
|
||
|
|
goto fail;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (e.refid != idx->refid) {
|
||
|
|
if (fd->index_sz < e.refid+2) {
|
||
|
|
cram_index *new_idx;
|
||
|
|
int new_sz = e.refid+2;
|
||
|
|
size_t index_end = fd->index_sz * sizeof(*fd->index);
|
||
|
|
new_idx = realloc(fd->index,
|
||
|
|
new_sz * sizeof(*fd->index));
|
||
|
|
if (!new_idx)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
fd->index = new_idx;
|
||
|
|
fd->index_sz = new_sz;
|
||
|
|
memset(((char *)fd->index) + index_end, 0,
|
||
|
|
fd->index_sz * sizeof(*fd->index) - index_end);
|
||
|
|
}
|
||
|
|
idx = &fd->index[e.refid+1];
|
||
|
|
idx->refid = e.refid;
|
||
|
|
idx->start = INT_MIN;
|
||
|
|
idx->end = INT_MAX;
|
||
|
|
idx->nslice = idx->nalloc = 0;
|
||
|
|
idx->e = NULL;
|
||
|
|
idx_stack[(idx_stack_ptr = 0)] = idx;
|
||
|
|
}
|
||
|
|
|
||
|
|
while (!(e.start >= idx->start && e.end <= idx->end) ||
|
||
|
|
(idx->start == 0 && idx->refid == -1)) {
|
||
|
|
idx = idx_stack[--idx_stack_ptr];
|
||
|
|
}
|
||
|
|
|
||
|
|
// Now contains, so append
|
||
|
|
if (idx->nslice+1 >= idx->nalloc) {
|
||
|
|
cram_index *new_e;
|
||
|
|
idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
|
||
|
|
new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
|
||
|
|
if (!new_e)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
idx->e = new_e;
|
||
|
|
}
|
||
|
|
|
||
|
|
e.nalloc = e.nslice = 0; e.e = NULL;
|
||
|
|
*(ep = &idx->e[idx->nslice++]) = e;
|
||
|
|
idx = ep;
|
||
|
|
|
||
|
|
if (++idx_stack_ptr >= idx_stack_alloc) {
|
||
|
|
cram_index **new_stack;
|
||
|
|
idx_stack_alloc *= 2;
|
||
|
|
new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
|
||
|
|
if (!new_stack)
|
||
|
|
goto fail;
|
||
|
|
idx_stack = new_stack;
|
||
|
|
}
|
||
|
|
idx_stack[idx_stack_ptr] = idx;
|
||
|
|
|
||
|
|
while (pos < kstr.l && kstr.s[pos] != '\n')
|
||
|
|
pos++;
|
||
|
|
pos++;
|
||
|
|
}
|
||
|
|
|
||
|
|
free(idx_stack);
|
||
|
|
free(kstr.s);
|
||
|
|
free(tfn_idx);
|
||
|
|
|
||
|
|
// Convert NCList to linear linked list
|
||
|
|
link_index(fd);
|
||
|
|
|
||
|
|
//dump_index(fd);
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
fail:
|
||
|
|
free(kstr.s);
|
||
|
|
free(idx_stack);
|
||
|
|
free(tfn_idx);
|
||
|
|
cram_index_free(fd); // Also sets fd->index = NULL
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Frees the child array of e together with, depth-first, everything
// nested beneath it.  The node e itself is not freed.
static void cram_index_free_recurse(cram_index *e) {
    int child;

    if (!e->e)
        return;

    for (child = 0; child < e->nslice; child++)
        cram_index_free_recurse(&e->e[child]);

    free(e->e);
}
|
||
|
|
|
||
|
|
/*
 * Frees the in-memory index attached to fd, if there is one.
 *
 * Afterwards fd->index is NULL and fd->index_sz is 0, so code that
 * bounds-checks against fd->index_sz cannot index the freed array.
 */
void cram_index_free(cram_fd *fd) {
    int i;

    if (!fd->index)
        return;

    for (i = 0; i < fd->index_sz; i++) {
        cram_index_free_recurse(&fd->index[i]);
    }
    free(fd->index);

    fd->index = NULL;
    fd->index_sz = 0;
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Searches the index for the first slice overlapping a reference ID
|
||
|
|
* and position, or one immediately preceding it if none is found in
|
||
|
|
* the index to overlap this position. (Our index may have missing
|
||
|
|
* entries, but we require at least one per reference.)
|
||
|
|
*
|
||
|
|
* If the index finds multiple slices overlapping this position we
|
||
|
|
* return the first one only. Subsequent calls should specify
|
||
|
|
* "from" as the last slice we checked to find the next one. Otherwise
|
||
|
|
* set "from" to be NULL to find the first one.
|
||
|
|
*
|
||
|
|
* Refid can also be any of the special HTS_IDX_ values.
|
||
|
|
* For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR.
|
||
|
|
*
|
||
|
|
* Returns the cram_index pointer on success
|
||
|
|
* NULL on failure
|
||
|
|
*/
|
||
|
|
cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
|
||
|
|
cram_index *from) {
|
||
|
|
int i, j, k;
|
||
|
|
cram_index *e;
|
||
|
|
|
||
|
|
if (from) {
|
||
|
|
// Continue from a previous search.
|
||
|
|
// We switch to just scanning the linked list, as the nested
|
||
|
|
// lists are typically short.
|
||
|
|
if (refid == HTS_IDX_NOCOOR)
|
||
|
|
refid = -1;
|
||
|
|
|
||
|
|
e = from->e_next;
|
||
|
|
if (e && e->refid == refid && e->start <= pos)
|
||
|
|
return e;
|
||
|
|
else
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
switch(refid) {
|
||
|
|
case HTS_IDX_NONE:
|
||
|
|
case HTS_IDX_REST:
|
||
|
|
// fail, or already there, dealt with elsewhere.
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
case -1:
|
||
|
|
case HTS_IDX_NOCOOR:
|
||
|
|
refid = -1;
|
||
|
|
pos = 0;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case HTS_IDX_START: {
|
||
|
|
int64_t min_idx = INT64_MAX;
|
||
|
|
for (i = 0, j = -1; i < fd->index_sz; i++) {
|
||
|
|
if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) {
|
||
|
|
min_idx = fd->index[i].e[0].offset;
|
||
|
|
j = i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (j < 0)
|
||
|
|
return NULL;
|
||
|
|
return fd->index[j].e;
|
||
|
|
}
|
||
|
|
|
||
|
|
default:
|
||
|
|
if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz)
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
from = &fd->index[refid+1];
|
||
|
|
|
||
|
|
// Ref with nothing aligned against it.
|
||
|
|
if (!from->e)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
// This sequence is covered by the index, so binary search to find
|
||
|
|
// the optimal starting block.
|
||
|
|
i = 0, j = fd->index[refid+1].nslice-1;
|
||
|
|
for (k = j/2; k != i; k = (j-i)/2 + i) {
|
||
|
|
if (from->e[k].refid > refid) {
|
||
|
|
j = k;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (from->e[k].refid < refid) {
|
||
|
|
i = k;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (from->e[k].start >= pos) {
|
||
|
|
j = k;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (from->e[k].start < pos) {
|
||
|
|
i = k;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// i==j or i==j-1. Check if j is better.
|
||
|
|
if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
|
||
|
|
i = j;
|
||
|
|
|
||
|
|
/* The above found *a* bin overlapping, but not necessarily the first */
|
||
|
|
while (i > 0 && from->e[i-1].end >= pos)
|
||
|
|
i--;
|
||
|
|
|
||
|
|
/* We may be one bin before the optimum, so check */
|
||
|
|
while (i+1 < from->nslice &&
|
||
|
|
(from->e[i].refid < refid ||
|
||
|
|
from->e[i].end < pos))
|
||
|
|
i++;
|
||
|
|
|
||
|
|
e = &from->e[i];
|
||
|
|
|
||
|
|
return e;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Return the index entry for last slice on a specific reference.
|
||
|
|
cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
|
||
|
|
int slice;
|
||
|
|
|
||
|
|
if (refid+1 < 0 || refid+1 >= fd->index_sz)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
if (!from)
|
||
|
|
from = &fd->index[refid+1];
|
||
|
|
|
||
|
|
// Ref with nothing aligned against it.
|
||
|
|
if (!from->e)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
slice = fd->index[refid+1].nslice - 1;
|
||
|
|
|
||
|
|
// e is the last entry in the nested containment list, but it may
|
||
|
|
// contain further slices within it.
|
||
|
|
cram_index *e = &from->e[slice];
|
||
|
|
while (e->e_next)
|
||
|
|
e = e->e_next;
|
||
|
|
|
||
|
|
return e;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Find the last container overlapping pos 'end', and the file offset of
|
||
|
|
* its end (equivalent to the start offset of the container following it).
|
||
|
|
*/
|
||
|
|
cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) {
|
||
|
|
cram_index *e = NULL, *prev_e;
|
||
|
|
do {
|
||
|
|
prev_e = e;
|
||
|
|
e = cram_index_query(fd, refid, end, prev_e);
|
||
|
|
} while (e);
|
||
|
|
|
||
|
|
if (!prev_e)
|
||
|
|
return NULL;
|
||
|
|
e = prev_e;
|
||
|
|
|
||
|
|
// Note: offset of e and e->e_next may be the same if we're using a
|
||
|
|
// multi-ref container where a single container generates multiple
|
||
|
|
// index entries.
|
||
|
|
//
|
||
|
|
// We need to keep iterating until offset differs in order to find
|
||
|
|
// the genuine file offset for the end of container.
|
||
|
|
do {
|
||
|
|
prev_e = e;
|
||
|
|
e = e->e_next;
|
||
|
|
} while (e && e->offset == prev_e->offset);
|
||
|
|
|
||
|
|
return prev_e;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Skips to a container overlapping the start coordinate listed in
|
||
|
|
* cram_range.
|
||
|
|
*
|
||
|
|
* In theory we call cram_index_query multiple times, once per slice
|
||
|
|
* overlapping the range. However slices may be absent from the index
|
||
|
|
* which makes this problematic. Instead we find the left-most slice
|
||
|
|
* and then read from then on, skipping decoding of slices and/or
|
||
|
|
* whole containers when they don't overlap the specified cram_range.
|
||
|
|
*
|
||
|
|
* This function also updates the cram_fd range field.
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on general failure
|
||
|
|
* -2 on no-data (empty chromosome)
|
||
|
|
*/
|
||
|
|
int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
|
||
|
|
int ret = 0;
|
||
|
|
cram_index *e;
|
||
|
|
|
||
|
|
if (r->refid == HTS_IDX_NONE) {
|
||
|
|
ret = -2; goto err;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Ideally use an index, so see if we have one.
|
||
|
|
if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
|
||
|
|
if (0 != cram_seek(fd, e->offset, SEEK_SET)) {
|
||
|
|
if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) {
|
||
|
|
ret = -1; goto err;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
// Absent from index, but this most likely means it simply has no data.
|
||
|
|
ret = -2; goto err;
|
||
|
|
}
|
||
|
|
|
||
|
|
pthread_mutex_lock(&fd->range_lock);
|
||
|
|
fd->range = *r;
|
||
|
|
if (r->refid == HTS_IDX_NOCOOR) {
|
||
|
|
fd->range.refid = -1;
|
||
|
|
fd->range.start = 0;
|
||
|
|
} else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
|
||
|
|
fd->range.refid = -2; // special case in cram_next_slice
|
||
|
|
}
|
||
|
|
pthread_mutex_unlock(&fd->range_lock);
|
||
|
|
|
||
|
|
if (fd->ctr) {
|
||
|
|
cram_free_container(fd->ctr);
|
||
|
|
if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
|
||
|
|
cram_free_container(fd->ctr_mt);
|
||
|
|
fd->ctr = NULL;
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
fd->ooc = 0;
|
||
|
|
fd->eof = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
err:
|
||
|
|
// It's unlikely fd->range will be accessed after EOF or error,
|
||
|
|
// but this maintains identical behaviour to the previous code.
|
||
|
|
pthread_mutex_lock(&fd->range_lock);
|
||
|
|
fd->range = *r;
|
||
|
|
pthread_mutex_unlock(&fd->range_lock);
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* A specialised form of cram_index_build (below) that deals with slices
|
||
|
|
* having multiple references in this (ref_id -2). In this scenario we
|
||
|
|
* decode the slice to look at the RI data series instead.
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on read failure
|
||
|
|
* -2 on wrong sort order
|
||
|
|
* -4 on write failure
|
||
|
|
*/
|
||
|
|
static int cram_index_build_multiref(cram_fd *fd,
|
||
|
|
cram_container *c,
|
||
|
|
cram_slice *s,
|
||
|
|
BGZF *fp,
|
||
|
|
off_t cpos,
|
||
|
|
int32_t landmark,
|
||
|
|
int sz) {
|
||
|
|
int i, ref = -2;
|
||
|
|
int64_t ref_start = 0, ref_end;
|
||
|
|
char buf[1024];
|
||
|
|
|
||
|
|
if (fd->mode != 'w') {
|
||
|
|
if (0 != cram_decode_slice(fd, c, s, fd->header))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
ref_end = INT_MIN;
|
||
|
|
|
||
|
|
int32_t last_ref = -9;
|
||
|
|
int32_t last_pos = -9;
|
||
|
|
for (i = 0; i < s->hdr->num_records; i++) {
|
||
|
|
if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) {
|
||
|
|
hts_log_error("CRAM file is not sorted by chromosome / position");
|
||
|
|
return -2;
|
||
|
|
}
|
||
|
|
last_ref = s->crecs[i].ref_id;
|
||
|
|
last_pos = s->crecs[i].apos;
|
||
|
|
|
||
|
|
if (s->crecs[i].ref_id == ref) {
|
||
|
|
if (ref_end < s->crecs[i].aend)
|
||
|
|
ref_end = s->crecs[i].aend;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ref != -2) {
|
||
|
|
snprintf(buf, sizeof(buf),
|
||
|
|
"%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
|
||
|
|
ref, ref_start, ref_end - ref_start + 1,
|
||
|
|
(int64_t)cpos, landmark, sz);
|
||
|
|
if (bgzf_write(fp, buf, strlen(buf)) < 0)
|
||
|
|
return -4;
|
||
|
|
}
|
||
|
|
|
||
|
|
ref = s->crecs[i].ref_id;
|
||
|
|
ref_start = s->crecs[i].apos;
|
||
|
|
ref_end = s->crecs[i].aend;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ref != -2) {
|
||
|
|
snprintf(buf, sizeof(buf),
|
||
|
|
"%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
|
||
|
|
ref, ref_start, ref_end - ref_start + 1,
|
||
|
|
(int64_t)cpos, landmark, sz);
|
||
|
|
if (bgzf_write(fp, buf, strlen(buf)) < 0)
|
||
|
|
return -4;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Adds a single slice to the index.
|
||
|
|
*/
|
||
|
|
int cram_index_slice(cram_fd *fd,
|
||
|
|
cram_container *c,
|
||
|
|
cram_slice *s,
|
||
|
|
BGZF *fp,
|
||
|
|
off_t cpos,
|
||
|
|
off_t spos, // relative to cpos
|
||
|
|
off_t sz) {
|
||
|
|
int ret;
|
||
|
|
char buf[1024];
|
||
|
|
|
||
|
|
if (sz > INT_MAX) {
|
||
|
|
hts_log_error("CRAM slice is too big (%"PRId64" bytes)",
|
||
|
|
(int64_t) sz);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (s->hdr->ref_seq_id == -2) {
|
||
|
|
ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz);
|
||
|
|
} else {
|
||
|
|
snprintf(buf, sizeof(buf),
|
||
|
|
"%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
|
||
|
|
s->hdr->ref_seq_id, s->hdr->ref_seq_start,
|
||
|
|
s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz);
|
||
|
|
ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
|
||
|
|
}
|
||
|
|
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Adds a single container to the index.
|
||
|
|
*/
|
||
|
|
static
|
||
|
|
int cram_index_container(cram_fd *fd,
|
||
|
|
cram_container *c,
|
||
|
|
BGZF *fp,
|
||
|
|
off_t cpos) {
|
||
|
|
int j;
|
||
|
|
off_t spos;
|
||
|
|
|
||
|
|
// 2.0 format
|
||
|
|
for (j = 0; j < c->num_landmarks; j++) {
|
||
|
|
cram_slice *s;
|
||
|
|
off_t sz;
|
||
|
|
int ret;
|
||
|
|
|
||
|
|
spos = htell(fd->fp);
|
||
|
|
if (spos - cpos - (off_t) c->offset != c->landmark[j]) {
|
||
|
|
hts_log_error("CRAM slice offset %"PRId64" does not match"
|
||
|
|
" landmark %d in container header (%"PRId32")",
|
||
|
|
(int64_t) (spos - cpos - (off_t) c->offset),
|
||
|
|
j, c->landmark[j]);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(s = cram_read_slice(fd))) {
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
sz = htell(fd->fp) - spos;
|
||
|
|
ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz);
|
||
|
|
|
||
|
|
cram_free_slice(s);
|
||
|
|
|
||
|
|
if (ret < 0) {
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Builds an index file.
|
||
|
|
*
|
||
|
|
* fd is a newly opened cram file that we wish to index.
|
||
|
|
* fn_base is the filename of the associated CRAM file.
|
||
|
|
* fn_idx is the filename of the index file to be written;
|
||
|
|
* if NULL, we add ".crai" to fn_base to get the index filename.
|
||
|
|
*
|
||
|
|
* Returns 0 on success,
|
||
|
|
* negative on failure (-1 for read failure, -4 for write failure)
|
||
|
|
*/
|
||
|
|
int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
|
||
|
|
cram_container *c;
|
||
|
|
off_t cpos, hpos;
|
||
|
|
BGZF *fp;
|
||
|
|
kstring_t fn_idx_str = {0};
|
||
|
|
int64_t last_ref = -9, last_start = -9;
|
||
|
|
|
||
|
|
// Useful for cram_index_build_multiref
|
||
|
|
cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR);
|
||
|
|
|
||
|
|
if (! fn_idx) {
|
||
|
|
kputs(fn_base, &fn_idx_str);
|
||
|
|
kputs(".crai", &fn_idx_str);
|
||
|
|
fn_idx = fn_idx_str.s;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(fp = bgzf_open(fn_idx, "wg"))) {
|
||
|
|
perror(fn_idx);
|
||
|
|
free(fn_idx_str.s);
|
||
|
|
return -4;
|
||
|
|
}
|
||
|
|
|
||
|
|
free(fn_idx_str.s);
|
||
|
|
|
||
|
|
cpos = htell(fd->fp);
|
||
|
|
while ((c = cram_read_container(fd))) {
|
||
|
|
if (fd->err) {
|
||
|
|
perror("Cram container read");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
hpos = htell(fd->fp);
|
||
|
|
|
||
|
|
if (!(c->comp_hdr_block = cram_read_block(fd)))
|
||
|
|
return -1;
|
||
|
|
assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
|
||
|
|
|
||
|
|
c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
|
||
|
|
if (!c->comp_hdr)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) {
|
||
|
|
hts_log_error("CRAM file is not sorted by chromosome / position");
|
||
|
|
return -2;
|
||
|
|
}
|
||
|
|
last_ref = c->ref_seq_id;
|
||
|
|
last_start = c->ref_seq_start;
|
||
|
|
|
||
|
|
if (cram_index_container(fd, c, fp, cpos) < 0) {
|
||
|
|
bgzf_close(fp);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
off_t next_cpos = htell(fd->fp);
|
||
|
|
if (next_cpos != hpos + c->length) {
|
||
|
|
hts_log_error("Length %"PRId32" in container header at offset %lld does not match block lengths (%lld)",
|
||
|
|
c->length, (long long) cpos, (long long) next_cpos - hpos);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
cpos = next_cpos;
|
||
|
|
|
||
|
|
cram_free_container(c);
|
||
|
|
}
|
||
|
|
if (fd->err) {
|
||
|
|
bgzf_close(fp);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return (bgzf_close(fp) >= 0)? 0 : -4;
|
||
|
|
}
|
||
|
|
|
||
|
|
// internal recursive step
|
||
|
|
static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos,
|
||
|
|
int64_t nct,
|
||
|
|
off_t cstart, off_t cend,
|
||
|
|
int64_t *first, int64_t *last) {
|
||
|
|
int64_t nc = 0, i;
|
||
|
|
|
||
|
|
if (e->offset) {
|
||
|
|
if (e->offset != *last_pos) {
|
||
|
|
if (e->offset >= cstart && (!cend || e->offset <= cend)) {
|
||
|
|
if (first && *first < 0)
|
||
|
|
*first = nct;
|
||
|
|
if (last)
|
||
|
|
*last = nct;
|
||
|
|
}
|
||
|
|
nc++;
|
||
|
|
}
|
||
|
|
// else a new multi-ref in same container
|
||
|
|
*last_pos = e->offset;
|
||
|
|
}
|
||
|
|
|
||
|
|
for (i = 0; i < e->nslice; i++)
|
||
|
|
nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct,
|
||
|
|
cstart, cend, first, last);
|
||
|
|
|
||
|
|
return nc;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*! Returns the number of containers in the CRAM file within given offsets.
|
||
|
|
*
|
||
|
|
* The cstart and cend offsets are the locations of the start of containers
|
||
|
|
* as returned by index_container_offset.
|
||
|
|
*
|
||
|
|
* If non-NULL, first and last will hold the inclusive range of container
|
||
|
|
* numbers, counting from zero.
|
||
|
|
*
|
||
|
|
* @return
|
||
|
|
* Returns the number of containers, equivalent to *last-*first+1.
|
||
|
|
*/
|
||
|
|
int64_t cram_num_containers_between(cram_fd *fd,
|
||
|
|
off_t cstart, off_t cend,
|
||
|
|
int64_t *first, int64_t *last) {
|
||
|
|
int64_t nc = 0, i;
|
||
|
|
int64_t last_pos = -99;
|
||
|
|
int64_t l_first = -1, l_last = -1;
|
||
|
|
|
||
|
|
for (i = 0; i < fd->index_sz; i++) {
|
||
|
|
int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
|
||
|
|
nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc,
|
||
|
|
cstart, cend, &l_first, &l_last);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (first)
|
||
|
|
*first = l_first;
|
||
|
|
if (last)
|
||
|
|
*last = l_last;
|
||
|
|
|
||
|
|
return l_last - l_first + 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Queries the total number of distinct containers in the index.
|
||
|
|
* Note there may be more containers in the file than in the index, as we
|
||
|
|
* are not required to have an index entry for every one.
|
||
|
|
*/
|
||
|
|
int64_t cram_num_containers(cram_fd *fd) {
|
||
|
|
return cram_num_containers_between(fd, 0, 0, NULL, NULL);
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*! Returns the byte offset for the start of the n^th container.
|
||
|
|
*
|
||
|
|
* The index must have previously been loaded, otherwise <0 is returned.
|
||
|
|
*/
|
||
|
|
static cram_index *cram_container_num2offset_(cram_index *e, int num,
|
||
|
|
int64_t *last_pos, int *nc) {
|
||
|
|
if (e->offset) {
|
||
|
|
if (e->offset != *last_pos) {
|
||
|
|
if (*nc == num)
|
||
|
|
return e;
|
||
|
|
(*nc)++;
|
||
|
|
}
|
||
|
|
// else a new multi-ref in same container
|
||
|
|
*last_pos = e->offset;
|
||
|
|
}
|
||
|
|
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < e->nslice; i++) {
|
||
|
|
cram_index *tmp = cram_container_num2offset_(&e->e[i], num,
|
||
|
|
last_pos, nc);
|
||
|
|
if (tmp)
|
||
|
|
return tmp;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
off_t cram_container_num2offset(cram_fd *fd, int64_t num) {
|
||
|
|
int nc = 0, i;
|
||
|
|
int64_t last_pos = -9;
|
||
|
|
cram_index *e = NULL;
|
||
|
|
|
||
|
|
for (i = 0; i < fd->index_sz; i++) {
|
||
|
|
int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
|
||
|
|
if (!fd->index[j].nslice)
|
||
|
|
continue;
|
||
|
|
if ((e = cram_container_num2offset_(&fd->index[j], num,
|
||
|
|
&last_pos, &nc)))
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
return e ? e->offset : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*! Returns the container number for the first container at offset >= pos.
|
||
|
|
*
|
||
|
|
* The index must have previously been loaded, otherwise <0 is returned.
|
||
|
|
*/
|
||
|
|
static cram_index *cram_container_offset2num_(cram_index *e, off_t pos,
|
||
|
|
int64_t *last_pos, int *nc) {
|
||
|
|
if (e->offset) {
|
||
|
|
if (e->offset != *last_pos) {
|
||
|
|
if (e->offset >= pos)
|
||
|
|
return e;
|
||
|
|
(*nc)++;
|
||
|
|
}
|
||
|
|
// else a new multi-ref in same container
|
||
|
|
*last_pos = e->offset;
|
||
|
|
}
|
||
|
|
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < e->nslice; i++) {
|
||
|
|
cram_index *tmp = cram_container_offset2num_(&e->e[i], pos,
|
||
|
|
last_pos, nc);
|
||
|
|
if (tmp)
|
||
|
|
return tmp;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
int64_t cram_container_offset2num(cram_fd *fd, off_t pos) {
|
||
|
|
int nc = 0, i;
|
||
|
|
int64_t last_pos = -9;
|
||
|
|
cram_index *e = NULL;
|
||
|
|
|
||
|
|
for (i = 0; i < fd->index_sz; i++) {
|
||
|
|
int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
|
||
|
|
if (!fd->index[j].nslice)
|
||
|
|
continue;
|
||
|
|
if ((e = cram_container_offset2num_(&fd->index[j], pos,
|
||
|
|
&last_pos, &nc)))
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
return e ? nc : -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*!
|
||
|
|
* Returns the file offsets of CRAM containers covering a specific region
|
||
|
|
* query. Note both offsets are the START of the container.
|
||
|
|
*
|
||
|
|
* first will point to the start of the first overlapping container
|
||
|
|
* last will point to the start of the last overlapping container
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* <0 on failure
|
||
|
|
*/
|
||
|
|
int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
|
||
|
|
off_t *first, off_t *last) {
|
||
|
|
cram_index *ci;
|
||
|
|
|
||
|
|
if (first) {
|
||
|
|
if (!(ci = cram_index_query(fd, refid, start, NULL)))
|
||
|
|
return -1;
|
||
|
|
*first = ci->offset;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (last) {
|
||
|
|
if (!(ci = cram_index_query_last(fd, refid, end)))
|
||
|
|
return -1;
|
||
|
|
*last = ci->offset;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|