3588 lines
122 KiB
C
3588 lines
122 KiB
C
|
|
/*
|
||
|
|
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
|
||
|
|
Author: James Bonfield <jkb@sanger.ac.uk>
|
||
|
|
|
||
|
|
Redistribution and use in source and binary forms, with or without
|
||
|
|
modification, are permitted provided that the following conditions are met:
|
||
|
|
|
||
|
|
1. Redistributions of source code must retain the above copyright notice,
|
||
|
|
this list of conditions and the following disclaimer.
|
||
|
|
|
||
|
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
||
|
|
this list of conditions and the following disclaimer in the documentation
|
||
|
|
and/or other materials provided with the distribution.
|
||
|
|
|
||
|
|
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
||
|
|
Institute nor the names of its contributors may be used to endorse or promote
|
||
|
|
products derived from this software without specific prior written permission.
|
||
|
|
|
||
|
|
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
|
||
|
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
|
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
|
||
|
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
* - In-memory decoding of CRAM data structures.
|
||
|
|
* - Iterator for reading CRAM record by record.
|
||
|
|
*/
|
||
|
|
|
||
|
|
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
|
||
|
|
#include <config.h>
|
||
|
|
|
||
|
|
#include <stdio.h>
|
||
|
|
#include <errno.h>
|
||
|
|
#include <assert.h>
|
||
|
|
#include <stdlib.h>
|
||
|
|
#include <string.h>
|
||
|
|
#include <zlib.h>
|
||
|
|
#include <sys/types.h>
|
||
|
|
#include <sys/stat.h>
|
||
|
|
#include <math.h>
|
||
|
|
#include <stdint.h>
|
||
|
|
#include <inttypes.h>
|
||
|
|
|
||
|
|
#include "cram.h"
|
||
|
|
#include "os.h"
|
||
|
|
#include "../htslib/hts.h"
|
||
|
|
|
||
|
|
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
|
||
|
|
//#define USE_X
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------
|
||
|
|
* CRAM compression headers
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Decodes the Tag Dictionary record in the preservation map
|
||
|
|
* Updates the cram compression header.
|
||
|
|
*
|
||
|
|
* Returns number of bytes decoded on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
int cram_decode_TD(cram_fd *fd, char *cp, const char *endp,
|
||
|
|
cram_block_compression_hdr *h) {
|
||
|
|
char *op = cp;
|
||
|
|
unsigned char *dat;
|
||
|
|
cram_block *b;
|
||
|
|
int32_t blk_size = 0;
|
||
|
|
int nTL, i, sz, err = 0;
|
||
|
|
|
||
|
|
if (!(b = cram_new_block(0, 0)))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (h->TD_blk || h->TL) {
|
||
|
|
hts_log_warning("More than one TD block found in compression header");
|
||
|
|
cram_free_block(h->TD_blk);
|
||
|
|
free(h->TL);
|
||
|
|
h->TD_blk = NULL;
|
||
|
|
h->TL = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Decode */
|
||
|
|
blk_size = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
if (!blk_size) {
|
||
|
|
h->nTL = 0;
|
||
|
|
cram_free_block(b);
|
||
|
|
return cp - op;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (err || blk_size < 0 || endp - cp < blk_size) {
|
||
|
|
cram_free_block(b);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
BLOCK_APPEND(b, cp, blk_size);
|
||
|
|
cp += blk_size;
|
||
|
|
sz = cp - op;
|
||
|
|
// Force nul termination if missing
|
||
|
|
if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
|
||
|
|
BLOCK_APPEND_CHAR(b, '\0');
|
||
|
|
|
||
|
|
/* Set up TL lookup table */
|
||
|
|
dat = BLOCK_DATA(b);
|
||
|
|
|
||
|
|
// Count
|
||
|
|
for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
|
||
|
|
nTL++;
|
||
|
|
while (dat[i])
|
||
|
|
i++;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Copy
|
||
|
|
if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) {
|
||
|
|
cram_free_block(b);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
|
||
|
|
h->TL[nTL++] = &dat[i];
|
||
|
|
while (dat[i])
|
||
|
|
i++;
|
||
|
|
}
|
||
|
|
h->TD_blk = b;
|
||
|
|
h->nTL = nTL;
|
||
|
|
|
||
|
|
return sz;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
cram_free_block(b);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Decodes a CRAM block compression header.
|
||
|
|
* Returns header ptr on success
|
||
|
|
* NULL on failure
|
||
|
|
*/
|
||
|
|
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
|
||
|
|
cram_block *b) {
|
||
|
|
char *cp, *endp, *cp_copy;
|
||
|
|
cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
|
||
|
|
int i, err = 0;
|
||
|
|
int32_t map_size = 0, map_count = 0;
|
||
|
|
|
||
|
|
if (!hdr)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
if (b->method != RAW) {
|
||
|
|
if (cram_uncompress_block(b)) {
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
cp = (char *)b->data;
|
||
|
|
endp = cp + b->uncomp_size;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err);
|
||
|
|
hdr->ref_seq_span = fd->vv.varint_get64(&cp, endp, &err);
|
||
|
|
} else {
|
||
|
|
hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
hdr->ref_seq_span = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
}
|
||
|
|
hdr->num_records = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
if (hdr->num_landmarks < 0 ||
|
||
|
|
hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) ||
|
||
|
|
endp - cp < hdr->num_landmarks) {
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
for (i = 0; i < hdr->num_landmarks; i++)
|
||
|
|
hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);;
|
||
|
|
}
|
||
|
|
|
||
|
|
hdr->preservation_map = kh_init(map);
|
||
|
|
|
||
|
|
memset(hdr->rec_encoding_map, 0,
|
||
|
|
CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
|
||
|
|
memset(hdr->tag_encoding_map, 0,
|
||
|
|
CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
|
||
|
|
|
||
|
|
if (!hdr->preservation_map) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Initialise defaults for preservation map */
|
||
|
|
hdr->read_names_included = 0;
|
||
|
|
hdr->AP_delta = 1;
|
||
|
|
hdr->qs_seq_orient = 1;
|
||
|
|
memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
|
||
|
|
|
||
|
|
/* Preservation map */
|
||
|
|
map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
|
||
|
|
map_count = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
for (i = 0; i < map_count; i++) {
|
||
|
|
pmap_t hd;
|
||
|
|
khint_t k;
|
||
|
|
int r;
|
||
|
|
|
||
|
|
if (endp - cp < 3) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
cp += 2;
|
||
|
|
switch(CRAM_KEY(cp[-2],cp[-1])) {
|
||
|
|
case CRAM_KEY('M','I'): // was mapped QS included in V1.0
|
||
|
|
case CRAM_KEY('U','I'): // was unmapped QS included in V1.0
|
||
|
|
case CRAM_KEY('P','I'): // was unmapped placed in V1.0
|
||
|
|
hd.i = *cp++;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case CRAM_KEY('R','N'):
|
||
|
|
hd.i = *cp++;
|
||
|
|
k = kh_put(map, hdr->preservation_map, "RN", &r);
|
||
|
|
if (-1 == r) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
kh_val(hdr->preservation_map, k) = hd;
|
||
|
|
hdr->read_names_included = hd.i;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case CRAM_KEY('A','P'):
|
||
|
|
hd.i = *cp++;
|
||
|
|
k = kh_put(map, hdr->preservation_map, "AP", &r);
|
||
|
|
if (-1 == r) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
kh_val(hdr->preservation_map, k) = hd;
|
||
|
|
hdr->AP_delta = hd.i;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case CRAM_KEY('R','R'):
|
||
|
|
hd.i = *cp++;
|
||
|
|
k = kh_put(map, hdr->preservation_map, "RR", &r);
|
||
|
|
if (-1 == r) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
kh_val(hdr->preservation_map, k) = hd;
|
||
|
|
hdr->no_ref = !hd.i;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case CRAM_KEY('Q','O'):
|
||
|
|
hd.i = *cp++;
|
||
|
|
k = kh_put(map, hdr->preservation_map, "QO", &r);
|
||
|
|
if (-1 == r) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
kh_val(hdr->preservation_map, k) = hd;
|
||
|
|
hdr->qs_seq_orient = hd.i;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case CRAM_KEY('S','M'):
|
||
|
|
if (endp - cp < 5) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
|
||
|
|
hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
|
||
|
|
hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
|
||
|
|
hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
|
||
|
|
|
||
|
|
hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
|
||
|
|
hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
|
||
|
|
hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
|
||
|
|
hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
|
||
|
|
|
||
|
|
hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
|
||
|
|
hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
|
||
|
|
hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
|
||
|
|
hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
|
||
|
|
|
||
|
|
hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
|
||
|
|
hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
|
||
|
|
hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
|
||
|
|
hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
|
||
|
|
|
||
|
|
hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
|
||
|
|
hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
|
||
|
|
hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
|
||
|
|
hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
|
||
|
|
|
||
|
|
hd.p = cp;
|
||
|
|
cp += 5;
|
||
|
|
|
||
|
|
k = kh_put(map, hdr->preservation_map, "SM", &r);
|
||
|
|
if (-1 == r) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
kh_val(hdr->preservation_map, k) = hd;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case CRAM_KEY('T','D'): {
|
||
|
|
int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary
|
||
|
|
if (sz < 0) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
hd.p = cp;
|
||
|
|
cp += sz;
|
||
|
|
|
||
|
|
k = kh_put(map, hdr->preservation_map, "TD", &r);
|
||
|
|
if (-1 == r) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
kh_val(hdr->preservation_map, k) = hd;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
default:
|
||
|
|
hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]);
|
||
|
|
// guess byte;
|
||
|
|
cp++;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (cp - cp_copy != map_size) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Record encoding map */
|
||
|
|
map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
|
||
|
|
map_count = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
|
||
|
|
for (i = 0; i < map_count; i++) {
|
||
|
|
char *key = cp;
|
||
|
|
int32_t encoding = E_NULL;
|
||
|
|
int32_t size = 0;
|
||
|
|
ptrdiff_t offset;
|
||
|
|
cram_map *m;
|
||
|
|
enum cram_DS_ID ds_id;
|
||
|
|
enum cram_external_type type;
|
||
|
|
|
||
|
|
if (endp - cp < 4) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
cp += 2;
|
||
|
|
encoding = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
size = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
|
||
|
|
offset = cp - (char *)b->data;
|
||
|
|
|
||
|
|
if (encoding == E_NULL)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (size < 0 || endp - cp < size) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
//printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
|
||
|
|
|
||
|
|
/*
|
||
|
|
* For CRAM1.0 CF and BF are Byte and not Int.
|
||
|
|
* Practically speaking it makes no difference unless we have a
|
||
|
|
* 1.0 format file that stores these in EXTERNAL as only then
|
||
|
|
* does Byte vs Int matter.
|
||
|
|
*
|
||
|
|
* Neither this C code nor Java reference implementations did this,
|
||
|
|
* so we gloss over it and treat them as int.
|
||
|
|
*/
|
||
|
|
ds_id = DS_CORE;
|
||
|
|
if (key[0] == 'B' && key[1] == 'F') {
|
||
|
|
ds_id = DS_BF; type = E_INT;
|
||
|
|
} else if (key[0] == 'C' && key[1] == 'F') {
|
||
|
|
ds_id = DS_CF; type = E_INT;
|
||
|
|
} else if (key[0] == 'R' && key[1] == 'I') {
|
||
|
|
ds_id = DS_RI; type = E_INT;
|
||
|
|
} else if (key[0] == 'R' && key[1] == 'L') {
|
||
|
|
ds_id = DS_RL; type = E_INT;
|
||
|
|
} else if (key[0] == 'A' && key[1] == 'P') {
|
||
|
|
ds_id = DS_AP;
|
||
|
|
type = is_v4 ? E_SLONG : E_INT;
|
||
|
|
} else if (key[0] == 'R' && key[1] == 'G') {
|
||
|
|
ds_id = DS_RG;
|
||
|
|
type = E_INT;
|
||
|
|
} else if (key[0] == 'M' && key[1] == 'F') {
|
||
|
|
ds_id = DS_MF; type = E_INT;
|
||
|
|
} else if (key[0] == 'N' && key[1] == 'S') {
|
||
|
|
ds_id = DS_NS; type = E_INT;
|
||
|
|
} else if (key[0] == 'N' && key[1] == 'P') {
|
||
|
|
ds_id = DS_NP;
|
||
|
|
type = is_v4 ? E_LONG : E_INT;
|
||
|
|
} else if (key[0] == 'T' && key[1] == 'S') {
|
||
|
|
ds_id = DS_TS;
|
||
|
|
type = is_v4 ? E_SLONG : E_INT;
|
||
|
|
} else if (key[0] == 'N' && key[1] == 'F') {
|
||
|
|
ds_id = DS_NF; type = E_INT;
|
||
|
|
} else if (key[0] == 'T' && key[1] == 'C') {
|
||
|
|
ds_id = DS_TC; type = E_BYTE;
|
||
|
|
} else if (key[0] == 'T' && key[1] == 'N') {
|
||
|
|
ds_id = DS_TN; type = E_INT;
|
||
|
|
} else if (key[0] == 'F' && key[1] == 'N') {
|
||
|
|
ds_id = DS_FN; type = E_INT;
|
||
|
|
} else if (key[0] == 'F' && key[1] == 'C') {
|
||
|
|
ds_id = DS_FC; type = E_BYTE;
|
||
|
|
} else if (key[0] == 'F' && key[1] == 'P') {
|
||
|
|
ds_id = DS_FP; type = E_INT;
|
||
|
|
} else if (key[0] == 'B' && key[1] == 'S') {
|
||
|
|
ds_id = DS_BS; type = E_BYTE;
|
||
|
|
} else if (key[0] == 'I' && key[1] == 'N') {
|
||
|
|
ds_id = DS_IN; type = E_BYTE_ARRAY;
|
||
|
|
} else if (key[0] == 'S' && key[1] == 'C') {
|
||
|
|
ds_id = DS_SC; type = E_BYTE_ARRAY;
|
||
|
|
} else if (key[0] == 'D' && key[1] == 'L') {
|
||
|
|
ds_id = DS_DL; type = E_INT;
|
||
|
|
} else if (key[0] == 'B' && key[1] == 'A') {
|
||
|
|
ds_id = DS_BA; type = E_BYTE;
|
||
|
|
} else if (key[0] == 'B' && key[1] == 'B') {
|
||
|
|
ds_id = DS_BB; type = E_BYTE_ARRAY;
|
||
|
|
} else if (key[0] == 'R' && key[1] == 'S') {
|
||
|
|
ds_id = DS_RS; type = E_INT;
|
||
|
|
} else if (key[0] == 'P' && key[1] == 'D') {
|
||
|
|
ds_id = DS_PD; type = E_INT;
|
||
|
|
} else if (key[0] == 'H' && key[1] == 'C') {
|
||
|
|
ds_id = DS_HC; type = E_INT;
|
||
|
|
} else if (key[0] == 'M' && key[1] == 'Q') {
|
||
|
|
ds_id = DS_MQ; type = E_INT;
|
||
|
|
} else if (key[0] == 'R' && key[1] == 'N') {
|
||
|
|
ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK;
|
||
|
|
} else if (key[0] == 'Q' && key[1] == 'S') {
|
||
|
|
ds_id = DS_QS; type = E_BYTE;
|
||
|
|
} else if (key[0] == 'Q' && key[1] == 'Q') {
|
||
|
|
ds_id = DS_QQ; type = E_BYTE_ARRAY;
|
||
|
|
} else if (key[0] == 'T' && key[1] == 'L') {
|
||
|
|
ds_id = DS_TL; type = E_INT;
|
||
|
|
} else if (key[0] == 'T' && key[1] == 'M') {
|
||
|
|
} else if (key[0] == 'T' && key[1] == 'V') {
|
||
|
|
} else {
|
||
|
|
hts_log_warning("Unrecognised key: %.2s", key);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds_id != DS_CORE) {
|
||
|
|
if (hdr->codecs[ds_id] != NULL) {
|
||
|
|
hts_log_warning("Codec for key %.2s defined more than once",
|
||
|
|
key);
|
||
|
|
hdr->codecs[ds_id]->free(hdr->codecs[ds_id]);
|
||
|
|
}
|
||
|
|
hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size,
|
||
|
|
type, fd->version, &fd->vv);
|
||
|
|
if (!hdr->codecs[ds_id]) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
cp += size;
|
||
|
|
|
||
|
|
// Fill out cram_map purely for cram_dump to dump out.
|
||
|
|
m = malloc(sizeof(*m));
|
||
|
|
if (!m) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
m->key = CRAM_KEY(key[0], key[1]);
|
||
|
|
m->encoding = encoding;
|
||
|
|
m->size = size;
|
||
|
|
m->offset = offset;
|
||
|
|
m->codec = NULL;
|
||
|
|
|
||
|
|
m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
|
||
|
|
hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
|
||
|
|
}
|
||
|
|
if (cp - cp_copy != map_size) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Tag encoding map */
|
||
|
|
map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
|
||
|
|
map_count = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
for (i = 0; i < map_count; i++) {
|
||
|
|
int32_t encoding = E_NULL;
|
||
|
|
int32_t size = 0;
|
||
|
|
cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
|
||
|
|
uint8_t key[3];
|
||
|
|
|
||
|
|
if (!m || endp - cp < 6) {
|
||
|
|
free(m);
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
m->key = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
key[0] = m->key>>16;
|
||
|
|
key[1] = m->key>>8;
|
||
|
|
key[2] = m->key;
|
||
|
|
encoding = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
size = fd->vv.varint_get32(&cp, endp, &err);
|
||
|
|
|
||
|
|
m->encoding = encoding;
|
||
|
|
m->size = size;
|
||
|
|
m->offset = cp - (char *)b->data;
|
||
|
|
if (size < 0 || endp - cp < size ||
|
||
|
|
!(m->codec = cram_decoder_init(hdr, encoding, cp, size,
|
||
|
|
E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
free(m);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
cp += size;
|
||
|
|
|
||
|
|
m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
|
||
|
|
hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
|
||
|
|
}
|
||
|
|
if (err || cp - cp_copy != map_size) {
|
||
|
|
cram_free_compression_header(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
return hdr;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Note we also need to scan through the record encoding map to
|
||
|
|
* see which data series share the same block, either external or
|
||
|
|
* CORE. For example if we need the BF data series but MQ and CF
|
||
|
|
* are also encoded in the same block then we need to add those in
|
||
|
|
* as a dependency in order to correctly decode BF.
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
int cram_dependent_data_series(cram_fd *fd,
|
||
|
|
cram_block_compression_hdr *hdr,
|
||
|
|
cram_slice *s) {
|
||
|
|
int *block_used;
|
||
|
|
int core_used = 0;
|
||
|
|
int i;
|
||
|
|
static int i_to_id[] = {
|
||
|
|
DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
|
||
|
|
DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
|
||
|
|
DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
|
||
|
|
DS_HC, DS_SC, DS_BB, DS_QQ,
|
||
|
|
};
|
||
|
|
uint32_t orig_ds;
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Set the data_series bit field based on fd->required_fields
|
||
|
|
* contents.
|
||
|
|
*/
|
||
|
|
if (fd->required_fields && fd->required_fields != INT_MAX) {
|
||
|
|
s->data_series = 0;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_QNAME)
|
||
|
|
s->data_series |= CRAM_RN;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_FLAG)
|
||
|
|
s->data_series |= CRAM_BF;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_RNAME)
|
||
|
|
s->data_series |= CRAM_RI | CRAM_BF;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_POS)
|
||
|
|
s->data_series |= CRAM_AP | CRAM_BF;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_MAPQ)
|
||
|
|
s->data_series |= CRAM_MQ;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_CIGAR)
|
||
|
|
s->data_series |= CRAM_CIGAR;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_RNEXT)
|
||
|
|
s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_PNEXT)
|
||
|
|
s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_TLEN)
|
||
|
|
s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
|
||
|
|
CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_SEQ)
|
||
|
|
s->data_series |= CRAM_SEQ;
|
||
|
|
|
||
|
|
if (!(fd->required_fields & SAM_AUX))
|
||
|
|
// No easy way to get MD/NM without other tags at present
|
||
|
|
s->decode_md = 0;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_QUAL)
|
||
|
|
s->data_series |= CRAM_QUAL;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_AUX)
|
||
|
|
s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_RGAUX)
|
||
|
|
s->data_series |= CRAM_RG | CRAM_BF;
|
||
|
|
|
||
|
|
// Always uncompress CORE block
|
||
|
|
if (cram_uncompress_block(s->block[0]))
|
||
|
|
return -1;
|
||
|
|
} else {
|
||
|
|
s->data_series = CRAM_ALL;
|
||
|
|
|
||
|
|
for (i = 0; i < s->hdr->num_blocks; i++) {
|
||
|
|
if (cram_uncompress_block(s->block[i]))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
|
||
|
|
if (!block_used)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
do {
|
||
|
|
/*
|
||
|
|
* Also set data_series based on code prerequisites. Eg if we need
|
||
|
|
* CRAM_QS then we also need to know CRAM_RL so we know how long it
|
||
|
|
* is, or if we need FC/FP then we also need FN (number of features).
|
||
|
|
*
|
||
|
|
* It's not reciprocal though. We may be needing to decode FN
|
||
|
|
* but have no need to decode FC, FP and cigar ops.
|
||
|
|
*/
|
||
|
|
if (s->data_series & CRAM_RS) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_PD) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_HC) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_QS) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_IN) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_SC) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_BS) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_DL) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_BA) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_BB) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
if (s->data_series & CRAM_QQ) s->data_series |= CRAM_FC|CRAM_FP;
|
||
|
|
|
||
|
|
// cram_decode_seq() needs seq[] array
|
||
|
|
if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
|
||
|
|
|
||
|
|
if (s->data_series & CRAM_FP) s->data_series |= CRAM_FC;
|
||
|
|
if (s->data_series & CRAM_FC) s->data_series |= CRAM_FN;
|
||
|
|
if (s->data_series & CRAM_aux) s->data_series |= CRAM_TL;
|
||
|
|
if (s->data_series & CRAM_MF) s->data_series |= CRAM_CF;
|
||
|
|
if (s->data_series & CRAM_MQ) s->data_series |= CRAM_BF;
|
||
|
|
if (s->data_series & CRAM_BS) s->data_series |= CRAM_RI;
|
||
|
|
if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
|
||
|
|
s->data_series |= CRAM_CF;
|
||
|
|
if (!hdr->read_names_included && s->data_series & CRAM_RN)
|
||
|
|
s->data_series |= CRAM_CF | CRAM_NF;
|
||
|
|
if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
|
||
|
|
s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
|
||
|
|
if (s->data_series & CRAM_FN) {
|
||
|
|
// The CRAM_FN loop checks for reference length boundaries,
|
||
|
|
// which needs a working seq_pos. Some fields are fixed size
|
||
|
|
// irrespective of if we decode (BS), but others need to know
|
||
|
|
// the size of the string fetched back (SC, IN, BB).
|
||
|
|
s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB;
|
||
|
|
}
|
||
|
|
|
||
|
|
orig_ds = s->data_series;
|
||
|
|
|
||
|
|
// Find which blocks are in use.
|
||
|
|
for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
|
||
|
|
int bnum1, bnum2, j;
|
||
|
|
cram_codec *c = hdr->codecs[i_to_id[i]];
|
||
|
|
|
||
|
|
if (!(s->data_series & (1<<i)))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (!c)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
bnum1 = cram_codec_to_id(c, &bnum2);
|
||
|
|
|
||
|
|
for (;;) {
|
||
|
|
switch (bnum1) {
|
||
|
|
case -2:
|
||
|
|
break;
|
||
|
|
|
||
|
|
case -1:
|
||
|
|
core_used = 1;
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
for (j = 0; j < s->hdr->num_blocks; j++) {
|
||
|
|
if (s->block[j]->content_type == EXTERNAL &&
|
||
|
|
s->block[j]->content_id == bnum1) {
|
||
|
|
block_used[j] = 1;
|
||
|
|
if (cram_uncompress_block(s->block[j])) {
|
||
|
|
free(block_used);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (bnum2 == -2 || bnum1 == bnum2)
|
||
|
|
break;
|
||
|
|
|
||
|
|
bnum1 = bnum2; // 2nd pass
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Tags too
|
||
|
|
if ((fd->required_fields & SAM_AUX) ||
|
||
|
|
(s->data_series & CRAM_aux)) {
|
||
|
|
for (i = 0; i < CRAM_MAP_HASH; i++) {
|
||
|
|
int bnum1, bnum2, j;
|
||
|
|
cram_map *m = hdr->tag_encoding_map[i];
|
||
|
|
|
||
|
|
while (m) {
|
||
|
|
cram_codec *c = m->codec;
|
||
|
|
if (!c) {
|
||
|
|
m = m->next;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
bnum1 = cram_codec_to_id(c, &bnum2);
|
||
|
|
|
||
|
|
for (;;) {
|
||
|
|
switch (bnum1) {
|
||
|
|
case -2:
|
||
|
|
break;
|
||
|
|
|
||
|
|
case -1:
|
||
|
|
core_used = 1;
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
for (j = 0; j < s->hdr->num_blocks; j++) {
|
||
|
|
if (s->block[j]->content_type == EXTERNAL &&
|
||
|
|
s->block[j]->content_id == bnum1) {
|
||
|
|
block_used[j] = 1;
|
||
|
|
if (cram_uncompress_block(s->block[j])) {
|
||
|
|
free(block_used);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (bnum2 == -2 || bnum1 == bnum2)
|
||
|
|
break;
|
||
|
|
|
||
|
|
bnum1 = bnum2; // 2nd pass
|
||
|
|
}
|
||
|
|
|
||
|
|
m = m->next;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// We now know which blocks are in used, so repeat and find
|
||
|
|
// which other data series need to be added.
|
||
|
|
for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
|
||
|
|
int bnum1, bnum2, j;
|
||
|
|
cram_codec *c = hdr->codecs[i_to_id[i]];
|
||
|
|
|
||
|
|
if (!c)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
bnum1 = cram_codec_to_id(c, &bnum2);
|
||
|
|
|
||
|
|
for (;;) {
|
||
|
|
switch (bnum1) {
|
||
|
|
case -2:
|
||
|
|
break;
|
||
|
|
|
||
|
|
case -1:
|
||
|
|
if (core_used) {
|
||
|
|
//printf(" + data series %08x:\n", 1<<i);
|
||
|
|
s->data_series |= 1<<i;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
for (j = 0; j < s->hdr->num_blocks; j++) {
|
||
|
|
if (s->block[j]->content_type == EXTERNAL &&
|
||
|
|
s->block[j]->content_id == bnum1) {
|
||
|
|
if (block_used[j]) {
|
||
|
|
//printf(" + data series %08x:\n", 1<<i);
|
||
|
|
s->data_series |= 1<<i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (bnum2 == -2 || bnum1 == bnum2)
|
||
|
|
break;
|
||
|
|
|
||
|
|
bnum1 = bnum2; // 2nd pass
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Tags too
|
||
|
|
for (i = 0; i < CRAM_MAP_HASH; i++) {
|
||
|
|
int bnum1, bnum2, j;
|
||
|
|
cram_map *m = hdr->tag_encoding_map[i];
|
||
|
|
|
||
|
|
while (m) {
|
||
|
|
cram_codec *c = m->codec;
|
||
|
|
if (!c) {
|
||
|
|
m = m->next;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
bnum1 = cram_codec_to_id(c, &bnum2);
|
||
|
|
|
||
|
|
for (;;) {
|
||
|
|
switch (bnum1) {
|
||
|
|
case -2:
|
||
|
|
break;
|
||
|
|
|
||
|
|
case -1:
|
||
|
|
//printf(" + data series %08x:\n", CRAM_aux);
|
||
|
|
s->data_series |= CRAM_aux;
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
for (j = 0; j < s->hdr->num_blocks; j++) {
|
||
|
|
if (s->block[j]->content_type == EXTERNAL &&
|
||
|
|
s->block[j]->content_id == bnum1) {
|
||
|
|
if (block_used[j]) {
|
||
|
|
//printf(" + data series %08x:\n",
|
||
|
|
// CRAM_aux);
|
||
|
|
s->data_series |= CRAM_aux;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (bnum2 == -2 || bnum1 == bnum2)
|
||
|
|
break;
|
||
|
|
|
||
|
|
bnum1 = bnum2; // 2nd pass
|
||
|
|
}
|
||
|
|
|
||
|
|
m = m->next;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} while (orig_ds != s->data_series);
|
||
|
|
|
||
|
|
free(block_used);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Checks whether an external block is used solely by a single data series.
|
||
|
|
* Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
|
||
|
|
* or 0 if not (E_NULL).
|
||
|
|
*/
|
||
|
|
static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
|
||
|
|
int id) {
|
||
|
|
int i, n_id = 0;
|
||
|
|
enum cram_encoding e_type = 0;
|
||
|
|
|
||
|
|
for (i = 0; i < DS_END; i++) {
|
||
|
|
cram_codec *c;
|
||
|
|
int bnum1, bnum2, old_n_id;
|
||
|
|
|
||
|
|
if (!(c = hdr->codecs[i]))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
bnum1 = cram_codec_to_id(c, &bnum2);
|
||
|
|
|
||
|
|
old_n_id = n_id;
|
||
|
|
if (bnum1 == id) {
|
||
|
|
n_id++;
|
||
|
|
e_type = c->codec;
|
||
|
|
}
|
||
|
|
if (bnum2 == id) {
|
||
|
|
n_id++;
|
||
|
|
e_type = c->codec;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (n_id == old_n_id+2)
|
||
|
|
n_id--; // len/val in same place counts once only.
|
||
|
|
}
|
||
|
|
|
||
|
|
return n_id == 1 ? e_type : 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Attempts to estimate the size of some blocks so we can preallocate them
|
||
|
|
* before decoding. Although decoding will automatically grow the blocks,
|
||
|
|
* it is typically more efficient to preallocate.
|
||
|
|
*/
|
||
|
|
void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
|
||
|
|
int *qual_size, int *name_size,
|
||
|
|
int *q_id) {
|
||
|
|
int bnum1, bnum2;
|
||
|
|
cram_codec *cd;
|
||
|
|
|
||
|
|
*qual_size = 0;
|
||
|
|
*name_size = 0;
|
||
|
|
|
||
|
|
/* Qual */
|
||
|
|
cd = hdr->codecs[DS_QS];
|
||
|
|
if (cd == NULL) return;
|
||
|
|
bnum1 = cram_codec_to_id(cd, &bnum2);
|
||
|
|
if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
|
||
|
|
if (cram_ds_unique(hdr, cd, bnum1)) {
|
||
|
|
cram_block *b = cram_get_block_by_id(s, bnum1);
|
||
|
|
if (b) *qual_size = b->uncomp_size;
|
||
|
|
if (q_id && cd->codec == E_EXTERNAL)
|
||
|
|
*q_id = bnum1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Name */
|
||
|
|
cd = hdr->codecs[DS_RN];
|
||
|
|
if (cd == NULL) return;
|
||
|
|
bnum1 = cram_codec_to_id(cd, &bnum2);
|
||
|
|
if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
|
||
|
|
if (cram_ds_unique(hdr, cd, bnum1)) {
|
||
|
|
cram_block *b = cram_get_block_by_id(s, bnum1);
|
||
|
|
if (b) *name_size = b->uncomp_size;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------
|
||
|
|
* CRAM slices
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Decodes a CRAM (un)mapped slice header block.
|
||
|
|
* Returns slice header ptr on success
|
||
|
|
* NULL on failure
|
||
|
|
*/
|
||
|
|
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
|
||
|
|
cram_block_slice_hdr *hdr;
|
||
|
|
unsigned char *cp;
|
||
|
|
unsigned char *cp_end;
|
||
|
|
int i, err = 0;
|
||
|
|
|
||
|
|
if (b->method != RAW) {
|
||
|
|
/* Spec. says slice header should be RAW, but we can future-proof
|
||
|
|
by trying to decode it if it isn't. */
|
||
|
|
if (cram_uncompress_block(b) < 0)
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
cp = (unsigned char *)BLOCK_DATA(b);
|
||
|
|
cp_end = cp + b->uncomp_size;
|
||
|
|
|
||
|
|
if (b->content_type != MAPPED_SLICE &&
|
||
|
|
b->content_type != UNMAPPED_SLICE)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
if (!(hdr = calloc(1, sizeof(*hdr))))
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
hdr->content_type = b->content_type;
|
||
|
|
|
||
|
|
if (b->content_type == MAPPED_SLICE) {
|
||
|
|
hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err);
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
|
||
|
|
hdr->ref_seq_span = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
|
||
|
|
} else {
|
||
|
|
hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
|
||
|
|
hdr->ref_seq_span = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
|
||
|
|
}
|
||
|
|
if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) {
|
||
|
|
free(hdr);
|
||
|
|
hts_log_error("Negative values not permitted for header "
|
||
|
|
"sequence start or span fields");
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
|
||
|
|
hdr->record_counter = 0;
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 2) {
|
||
|
|
hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
|
||
|
|
} else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
||
|
|
hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
|
||
|
|
}
|
||
|
|
hdr->num_blocks = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
|
||
|
|
hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
|
||
|
|
if (hdr->num_content_ids < 1 ||
|
||
|
|
hdr->num_content_ids >= 10000) {
|
||
|
|
// Slice must have at least one data block, and there is no need
|
||
|
|
// for more than 2 per possible aux-tag plus ancillary.
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
|
||
|
|
if (!hdr->block_content_ids) {
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
for (i = 0; i < hdr->num_content_ids; i++)
|
||
|
|
hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp,
|
||
|
|
(char *)cp_end,
|
||
|
|
&err);
|
||
|
|
if (err) {
|
||
|
|
free(hdr->block_content_ids);
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (b->content_type == MAPPED_SLICE)
|
||
|
|
hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) != 1) {
|
||
|
|
if (cp_end - cp < 16) {
|
||
|
|
free(hdr->block_content_ids);
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
memcpy(hdr->md5, cp, 16);
|
||
|
|
} else {
|
||
|
|
memset(hdr->md5, 0, 16);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!err)
|
||
|
|
return hdr;
|
||
|
|
|
||
|
|
free(hdr->block_content_ids);
|
||
|
|
free(hdr);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
#if 0
|
||
|
|
/* Returns the number of bits set in val; it the highest bit used */
|
||
|
|
static int nbits(int v) {
|
||
|
|
static const int MultiplyDeBruijnBitPosition[32] = {
|
||
|
|
1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
|
||
|
|
9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
|
||
|
|
};
|
||
|
|
|
||
|
|
v |= v >> 1; // first up to set all bits 1 after the first 1 */
|
||
|
|
v |= v >> 2;
|
||
|
|
v |= v >> 4;
|
||
|
|
v |= v >> 8;
|
||
|
|
v |= v >> 16;
|
||
|
|
|
||
|
|
// DeBruijn magic to find top bit
|
||
|
|
return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
|
||
|
|
#if 0
|
||
|
|
static int sort_freqs(const void *vp1, const void *vp2) {
|
||
|
|
const int i1 = *(const int *)vp1;
|
||
|
|
const int i2 = *(const int *)vp2;
|
||
|
|
return i1-i2;
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------
|
||
|
|
* Primary CRAM sequence decoder
|
||
|
|
*/
|
||
|
|
|
||
|
|
static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
|
||
|
|
if (decode_md) {
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
|
||
|
|
BLOCK_APPEND_CHAR(s->aux_blk, c);
|
||
|
|
*md_dist = 0;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Internal part of cram_decode_slice().
|
||
|
|
* Generates the sequence, quality and cigar components.
|
||
|
|
*/
|
||
|
|
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
|
||
|
|
cram_block *blk, cram_record *cr, sam_hdr_t *sh,
|
||
|
|
int cf, char *seq, char *qual,
|
||
|
|
int has_MD, int has_NM) {
|
||
|
|
int prev_pos = 0, f, r = 0, out_sz = 1;
|
||
|
|
int seq_pos = 1;
|
||
|
|
int cig_len = 0;
|
||
|
|
int64_t ref_pos = cr->apos;
|
||
|
|
int32_t fn, i32;
|
||
|
|
enum cigar_op cig_op = BAM_CMATCH;
|
||
|
|
uint32_t *cigar = s->cigar;
|
||
|
|
uint32_t ncigar = s->ncigar;
|
||
|
|
uint32_t cigar_alloc = s->cigar_alloc;
|
||
|
|
uint32_t nm = 0;
|
||
|
|
int32_t md_dist = 0;
|
||
|
|
int orig_aux = 0;
|
||
|
|
// CRAM < 4.0 decode_md is off/on
|
||
|
|
// CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise)
|
||
|
|
int do_md = CRAM_MAJOR_VERS(fd->version) >= 4
|
||
|
|
? (s->decode_md > 0)
|
||
|
|
: (s->decode_md != 0);
|
||
|
|
int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0);
|
||
|
|
int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0);
|
||
|
|
uint32_t ds = s->data_series;
|
||
|
|
sam_hrecs_t *bfd = sh->hrecs;
|
||
|
|
|
||
|
|
cram_codec **codecs = c->comp_hdr->codecs;
|
||
|
|
|
||
|
|
if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
|
||
|
|
memset(qual, 255, cr->len);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
|
||
|
|
decode_md = decode_nm = 0;
|
||
|
|
|
||
|
|
if (decode_md) {
|
||
|
|
orig_aux = BLOCK_SIZE(s->aux_blk);
|
||
|
|
if (has_MD == 0)
|
||
|
|
BLOCK_APPEND(s->aux_blk, "MDZ", 3);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_FN) {
|
||
|
|
if (!codecs[DS_FN]) return -1;
|
||
|
|
r |= codecs[DS_FN]->decode(s,codecs[DS_FN],
|
||
|
|
blk, (char *)&fn, &out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
} else {
|
||
|
|
fn = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
ref_pos--; // count from 0
|
||
|
|
cr->cigar = ncigar;
|
||
|
|
|
||
|
|
if (!(ds & (CRAM_FC | CRAM_FP)))
|
||
|
|
goto skip_cigar;
|
||
|
|
|
||
|
|
if (fn) {
|
||
|
|
if ((ds & CRAM_FC) && !codecs[DS_FC])
|
||
|
|
return -1;
|
||
|
|
if ((ds & CRAM_FP) && !codecs[DS_FP])
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
for (f = 0; f < fn; f++) {
|
||
|
|
int32_t pos = 0;
|
||
|
|
char op;
|
||
|
|
|
||
|
|
if (ncigar+2 >= cigar_alloc) {
|
||
|
|
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
|
||
|
|
if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
|
||
|
|
return -1;
|
||
|
|
s->cigar = cigar;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_FC) {
|
||
|
|
r |= codecs[DS_FC]->decode(s,
|
||
|
|
codecs[DS_FC],
|
||
|
|
blk,
|
||
|
|
&op, &out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(ds & CRAM_FP))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
r |= codecs[DS_FP]->decode(s,
|
||
|
|
codecs[DS_FP],
|
||
|
|
blk,
|
||
|
|
(char *)&pos, &out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
pos += prev_pos;
|
||
|
|
|
||
|
|
if (pos <= 0) {
|
||
|
|
hts_log_error("Feature position %d before start of read", pos);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (pos > seq_pos) {
|
||
|
|
if (pos > cr->len+1)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (s->ref && cr->ref_id >= 0) {
|
||
|
|
if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
|
||
|
|
static int whinged = 0;
|
||
|
|
int rlen;
|
||
|
|
if (!whinged)
|
||
|
|
hts_log_warning("Ref pos outside of ref sequence boundary");
|
||
|
|
whinged = 1;
|
||
|
|
rlen = bfd->ref[cr->ref_id].len - ref_pos;
|
||
|
|
// May miss MD/NM cases where both seq/ref are N, but this is a
|
||
|
|
// malformed cram file anyway.
|
||
|
|
if (rlen > 0) {
|
||
|
|
if (ref_pos + rlen > s->ref_end)
|
||
|
|
goto beyond_slice;
|
||
|
|
|
||
|
|
memcpy(&seq[seq_pos-1],
|
||
|
|
&s->ref[ref_pos - s->ref_start +1], rlen);
|
||
|
|
if ((pos - seq_pos) - rlen > 0)
|
||
|
|
memset(&seq[seq_pos-1+rlen], 'N',
|
||
|
|
(pos - seq_pos) - rlen);
|
||
|
|
} else {
|
||
|
|
memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
|
||
|
|
}
|
||
|
|
if (md_dist >= 0)
|
||
|
|
md_dist += pos - seq_pos;
|
||
|
|
} else {
|
||
|
|
// 'N' in both ref and seq is also mismatch for NM/MD
|
||
|
|
if (ref_pos + pos-seq_pos > s->ref_end)
|
||
|
|
goto beyond_slice;
|
||
|
|
|
||
|
|
const char *refp = s->ref + ref_pos - s->ref_start + 1;
|
||
|
|
const int frag_len = pos - seq_pos;
|
||
|
|
int do_cpy = 1;
|
||
|
|
if (decode_md || decode_nm) {
|
||
|
|
char *N = memchr(refp, 'N', frag_len);
|
||
|
|
if (N) {
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < frag_len; i++) {
|
||
|
|
char base = refp[i];
|
||
|
|
if (base == 'N') {
|
||
|
|
if (add_md_char(s, decode_md,
|
||
|
|
'N', &md_dist) < 0)
|
||
|
|
return -1;
|
||
|
|
nm++;
|
||
|
|
} else {
|
||
|
|
md_dist++;
|
||
|
|
}
|
||
|
|
seq[seq_pos-1+i] = base;
|
||
|
|
}
|
||
|
|
do_cpy = 0;
|
||
|
|
} else {
|
||
|
|
md_dist += frag_len;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (do_cpy)
|
||
|
|
memcpy(&seq[seq_pos-1], refp, frag_len);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#ifdef USE_X
|
||
|
|
if (cig_len && cig_op != BAM_CBASE_MATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
cig_op = BAM_CBASE_MATCH;
|
||
|
|
#else
|
||
|
|
if (cig_len && cig_op != BAM_CMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
cig_op = BAM_CMATCH;
|
||
|
|
#endif
|
||
|
|
cig_len += pos - seq_pos;
|
||
|
|
ref_pos += pos - seq_pos;
|
||
|
|
seq_pos = pos;
|
||
|
|
}
|
||
|
|
|
||
|
|
prev_pos = pos;
|
||
|
|
|
||
|
|
if (!(ds & CRAM_FC))
|
||
|
|
goto skip_cigar;
|
||
|
|
|
||
|
|
switch(op) {
|
||
|
|
case 'S': { // soft clip: IN
|
||
|
|
int32_t out_sz2 = 1;
|
||
|
|
int have_sc = 0;
|
||
|
|
|
||
|
|
if (cig_len) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
switch (CRAM_MAJOR_VERS(fd->version)) {
|
||
|
|
case 1:
|
||
|
|
if (ds & CRAM_IN) {
|
||
|
|
r |= codecs[DS_IN]
|
||
|
|
? codecs[DS_IN]->decode(s, codecs[DS_IN],
|
||
|
|
blk,
|
||
|
|
cr->len ? &seq[pos-1] : NULL,
|
||
|
|
&out_sz2)
|
||
|
|
: (seq[pos-1] = 'N', out_sz2 = 1, 0);
|
||
|
|
have_sc = 1;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
case 2:
|
||
|
|
default:
|
||
|
|
if (ds & CRAM_SC) {
|
||
|
|
r |= codecs[DS_SC]
|
||
|
|
? codecs[DS_SC]->decode(s, codecs[DS_SC],
|
||
|
|
blk,
|
||
|
|
cr->len ? &seq[pos-1] : NULL,
|
||
|
|
&out_sz2)
|
||
|
|
: (seq[pos-1] = 'N', out_sz2 = 1, 0);
|
||
|
|
have_sc = 1;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
//default:
|
||
|
|
// r |= codecs[DS_BB]
|
||
|
|
// ? codecs[DS_BB]->decode(s, codecs[DS_BB],
|
||
|
|
// blk, &seq[pos-1], &out_sz2)
|
||
|
|
// : (seq[pos-1] = 'N', out_sz2 = 1, 0);
|
||
|
|
}
|
||
|
|
if (have_sc) {
|
||
|
|
if (r) return r;
|
||
|
|
cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
|
||
|
|
cig_op = BAM_CSOFT_CLIP;
|
||
|
|
seq_pos += out_sz2;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'X': { // Substitution; BS
|
||
|
|
unsigned char base;
|
||
|
|
#ifdef USE_X
|
||
|
|
if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
if (ds & CRAM_BS) {
|
||
|
|
if (!codecs[DS_BS]) return -1;
|
||
|
|
r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
|
||
|
|
(char *)&base, &out_sz);
|
||
|
|
if (pos-1 < cr->len)
|
||
|
|
seq[pos-1] = 'N'; // FIXME look up BS=base value
|
||
|
|
}
|
||
|
|
cig_op = BAM_CBASE_MISMATCH;
|
||
|
|
#else
|
||
|
|
int ref_base;
|
||
|
|
if (cig_len && cig_op != BAM_CMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
if (ds & CRAM_BS) {
|
||
|
|
if (!codecs[DS_BS]) return -1;
|
||
|
|
r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
|
||
|
|
(char *)&base, &out_sz);
|
||
|
|
if (r) return -1;
|
||
|
|
if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
|
||
|
|
if (pos-1 < cr->len)
|
||
|
|
seq[pos-1] = c->comp_hdr->
|
||
|
|
substitution_matrix[fd->L1['N']][base];
|
||
|
|
if (decode_md || decode_nm) {
|
||
|
|
if (md_dist >= 0 && decode_md)
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
||
|
|
md_dist = -1;
|
||
|
|
nm--;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
unsigned char ref_call = ref_pos < s->ref_end
|
||
|
|
? (uc)s->ref[ref_pos - s->ref_start +1]
|
||
|
|
: 'N';
|
||
|
|
ref_base = fd->L1[ref_call];
|
||
|
|
if (pos-1 < cr->len)
|
||
|
|
seq[pos-1] = c->comp_hdr->
|
||
|
|
substitution_matrix[ref_base][base];
|
||
|
|
if (add_md_char(s, decode_md, ref_call, &md_dist) < 0)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
cig_op = BAM_CMATCH;
|
||
|
|
#endif
|
||
|
|
nm++;
|
||
|
|
cig_len++;
|
||
|
|
seq_pos++;
|
||
|
|
ref_pos++;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'D': { // Deletion; DL
|
||
|
|
if (cig_len && cig_op != BAM_CDEL) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
if (ds & CRAM_DL) {
|
||
|
|
if (!codecs[DS_DL]) return -1;
|
||
|
|
r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk,
|
||
|
|
(char *)&i32, &out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
if (decode_md || decode_nm) {
|
||
|
|
if (ref_pos + i32 > s->ref_end)
|
||
|
|
goto beyond_slice;
|
||
|
|
if (md_dist >= 0 && decode_md)
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
||
|
|
if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
|
||
|
|
if (decode_md) {
|
||
|
|
BLOCK_APPEND_CHAR(s->aux_blk, '^');
|
||
|
|
BLOCK_APPEND(s->aux_blk,
|
||
|
|
&s->ref[ref_pos - s->ref_start +1],
|
||
|
|
i32);
|
||
|
|
md_dist = 0;
|
||
|
|
}
|
||
|
|
nm += i32;
|
||
|
|
} else {
|
||
|
|
uint32_t dlen;
|
||
|
|
if (bfd->ref[cr->ref_id].len >= ref_pos) {
|
||
|
|
if (decode_md) {
|
||
|
|
BLOCK_APPEND_CHAR(s->aux_blk, '^');
|
||
|
|
BLOCK_APPEND(s->aux_blk,
|
||
|
|
&s->ref[ref_pos - s->ref_start+1],
|
||
|
|
bfd->ref[cr->ref_id].len-ref_pos);
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, 0);
|
||
|
|
}
|
||
|
|
dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
|
||
|
|
nm += i32 - dlen;
|
||
|
|
} else {
|
||
|
|
dlen = i32;
|
||
|
|
}
|
||
|
|
|
||
|
|
md_dist = -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
cig_op = BAM_CDEL;
|
||
|
|
cig_len += i32;
|
||
|
|
ref_pos += i32;
|
||
|
|
//printf(" %d: DL = %d (ret %d)\n", f, i32, r);
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'I': { // Insertion (several bases); IN
|
||
|
|
int32_t out_sz2 = 1;
|
||
|
|
|
||
|
|
if (cig_len && cig_op != BAM_CINS) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_IN) {
|
||
|
|
if (!codecs[DS_IN]) return -1;
|
||
|
|
r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk,
|
||
|
|
cr->len ? &seq[pos-1] : NULL,
|
||
|
|
&out_sz2);
|
||
|
|
if (r) return r;
|
||
|
|
cig_op = BAM_CINS;
|
||
|
|
cig_len += out_sz2;
|
||
|
|
seq_pos += out_sz2;
|
||
|
|
nm += out_sz2;
|
||
|
|
//printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'i': { // Insertion (single base); BA
|
||
|
|
if (cig_len && cig_op != BAM_CINS) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
if (ds & CRAM_BA) {
|
||
|
|
if (!codecs[DS_BA]) return -1;
|
||
|
|
r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
|
||
|
|
cr->len ? &seq[pos-1] : NULL,
|
||
|
|
&out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
}
|
||
|
|
cig_op = BAM_CINS;
|
||
|
|
cig_len++;
|
||
|
|
seq_pos++;
|
||
|
|
nm++;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'b': { // Several bases
|
||
|
|
int32_t len = 1;
|
||
|
|
|
||
|
|
if (cig_len && cig_op != BAM_CMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_BB) {
|
||
|
|
if (!codecs[DS_BB]) return -1;
|
||
|
|
r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk,
|
||
|
|
cr->len ? &seq[pos-1] : NULL,
|
||
|
|
&len);
|
||
|
|
if (r) return r;
|
||
|
|
|
||
|
|
if (decode_md || decode_nm) {
|
||
|
|
int x;
|
||
|
|
if (md_dist >= 0 && decode_md)
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
||
|
|
|
||
|
|
for (x = 0; x < len; x++) {
|
||
|
|
if (x && decode_md)
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, 0);
|
||
|
|
if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
|
||
|
|
md_dist = -1;
|
||
|
|
break;
|
||
|
|
} else {
|
||
|
|
if (decode_md) {
|
||
|
|
if (ref_pos + x > s->ref_end)
|
||
|
|
goto beyond_slice;
|
||
|
|
char r = s->ref[ref_pos+x-s->ref_start +1];
|
||
|
|
BLOCK_APPEND_CHAR(s->aux_blk, r);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
nm += x;
|
||
|
|
md_dist = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
cig_op = BAM_CMATCH;
|
||
|
|
|
||
|
|
cig_len+=len;
|
||
|
|
seq_pos+=len;
|
||
|
|
ref_pos+=len;
|
||
|
|
//prev_pos+=len;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'q': { // Several quality values
|
||
|
|
int32_t len = 1;
|
||
|
|
|
||
|
|
if (cig_len && cig_op != BAM_CMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_QQ) {
|
||
|
|
if (!codecs[DS_QQ]) return -1;
|
||
|
|
if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
|
||
|
|
&& (unsigned char)*qual == 255)
|
||
|
|
memset(qual, 30, cr->len); // ?
|
||
|
|
r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk,
|
||
|
|
(char *)&qual[pos-1], &len);
|
||
|
|
if (r) return r;
|
||
|
|
}
|
||
|
|
|
||
|
|
cig_op = BAM_CMATCH;
|
||
|
|
|
||
|
|
//prev_pos+=len;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'B': { // Read base; BA, QS
|
||
|
|
#ifdef USE_X
|
||
|
|
if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
#else
|
||
|
|
if (cig_len && cig_op != BAM_CMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
if (ds & CRAM_BA) {
|
||
|
|
if (!codecs[DS_BA]) return -1;
|
||
|
|
r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
|
||
|
|
cr->len ? &seq[pos-1] : NULL,
|
||
|
|
&out_sz);
|
||
|
|
|
||
|
|
if (decode_md || decode_nm) {
|
||
|
|
if (md_dist >= 0 && decode_md)
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
||
|
|
if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
|
||
|
|
md_dist = -1;
|
||
|
|
} else {
|
||
|
|
if (decode_md) {
|
||
|
|
if (ref_pos > s->ref_end)
|
||
|
|
goto beyond_slice;
|
||
|
|
BLOCK_APPEND_CHAR(s->aux_blk,
|
||
|
|
s->ref[ref_pos-s->ref_start +1]);
|
||
|
|
}
|
||
|
|
nm++;
|
||
|
|
md_dist = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (ds & CRAM_QS) {
|
||
|
|
if (!codecs[DS_QS]) return -1;
|
||
|
|
if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
|
||
|
|
&& (unsigned char)*qual == 255)
|
||
|
|
memset(qual, 30, cr->len); // ASCII ?. Same as htsjdk
|
||
|
|
r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
|
||
|
|
(char *)&qual[pos-1], &out_sz);
|
||
|
|
}
|
||
|
|
#ifdef USE_X
|
||
|
|
cig_op = BAM_CBASE_MISMATCH;
|
||
|
|
#else
|
||
|
|
cig_op = BAM_CMATCH;
|
||
|
|
#endif
|
||
|
|
cig_len++;
|
||
|
|
seq_pos++;
|
||
|
|
ref_pos++;
|
||
|
|
//printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'Q': { // Quality score; QS
|
||
|
|
if (ds & CRAM_QS) {
|
||
|
|
if (!codecs[DS_QS]) return -1;
|
||
|
|
if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) &&
|
||
|
|
(unsigned char)*qual == 255)
|
||
|
|
memset(qual, 30, cr->len); // ?
|
||
|
|
r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
|
||
|
|
(char *)&qual[pos-1], &out_sz);
|
||
|
|
//printf(" %d: QS = %d (ret %d)\n", f, qc, r);
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'H': { // hard clip; HC
|
||
|
|
if (cig_len && cig_op != BAM_CHARD_CLIP) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
if (ds & CRAM_HC) {
|
||
|
|
if (!codecs[DS_HC]) return -1;
|
||
|
|
r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk,
|
||
|
|
(char *)&i32, &out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
cig_op = BAM_CHARD_CLIP;
|
||
|
|
cig_len += i32;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'P': { // padding; PD
|
||
|
|
if (cig_len && cig_op != BAM_CPAD) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
if (ds & CRAM_PD) {
|
||
|
|
if (!codecs[DS_PD]) return -1;
|
||
|
|
r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk,
|
||
|
|
(char *)&i32, &out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
cig_op = BAM_CPAD;
|
||
|
|
cig_len += i32;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'N': { // Ref skip; RS
|
||
|
|
if (cig_len && cig_op != BAM_CREF_SKIP) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
if (ds & CRAM_RS) {
|
||
|
|
if (!codecs[DS_RS]) return -1;
|
||
|
|
r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk,
|
||
|
|
(char *)&i32, &out_sz);
|
||
|
|
if (r) return r;
|
||
|
|
cig_op = BAM_CREF_SKIP;
|
||
|
|
cig_len += i32;
|
||
|
|
ref_pos += i32;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
default:
|
||
|
|
hts_log_error("Unknown feature code '%c'", op);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(ds & CRAM_FC))
|
||
|
|
goto skip_cigar;
|
||
|
|
|
||
|
|
/* An implicit match op for any unaccounted for bases */
|
||
|
|
if ((ds & CRAM_FN) && cr->len >= seq_pos) {
|
||
|
|
if (s->ref && cr->ref_id >= 0) {
|
||
|
|
if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
|
||
|
|
static int whinged = 0;
|
||
|
|
int rlen;
|
||
|
|
if (!whinged)
|
||
|
|
hts_log_warning("Ref pos outside of ref sequence boundary");
|
||
|
|
whinged = 1;
|
||
|
|
rlen = bfd->ref[cr->ref_id].len - ref_pos;
|
||
|
|
// May miss MD/NM cases where both seq/ref are N, but this is a
|
||
|
|
// malformed cram file anyway.
|
||
|
|
if (rlen > 0) {
|
||
|
|
if (seq_pos-1 + rlen < cr->len)
|
||
|
|
memcpy(&seq[seq_pos-1],
|
||
|
|
&s->ref[ref_pos - s->ref_start +1], rlen);
|
||
|
|
if ((cr->len - seq_pos + 1) - rlen > 0)
|
||
|
|
memset(&seq[seq_pos-1+rlen], 'N',
|
||
|
|
(cr->len - seq_pos + 1) - rlen);
|
||
|
|
} else {
|
||
|
|
if (cr->len - seq_pos + 1 > 0)
|
||
|
|
memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
|
||
|
|
}
|
||
|
|
if (md_dist >= 0)
|
||
|
|
md_dist += cr->len - seq_pos + 1;
|
||
|
|
} else {
|
||
|
|
if (cr->len - seq_pos + 1 > 0) {
|
||
|
|
if (ref_pos + cr->len-seq_pos +1 > s->ref_end)
|
||
|
|
goto beyond_slice;
|
||
|
|
int remainder = cr->len - (seq_pos-1);
|
||
|
|
int j = ref_pos - s->ref_start + 1;
|
||
|
|
if (decode_md || decode_nm) {
|
||
|
|
int i;
|
||
|
|
char *N = memchr(&s->ref[j], 'N', remainder);
|
||
|
|
if (!N) {
|
||
|
|
// short cut the common case
|
||
|
|
md_dist += cr->len - (seq_pos-1);
|
||
|
|
} else {
|
||
|
|
char *refp = &s->ref[j-(seq_pos-1)];
|
||
|
|
md_dist += N-&s->ref[j];
|
||
|
|
int i_start = seq_pos-1 + (N - &s->ref[j]);
|
||
|
|
for (i = i_start; i < cr->len; i++) {
|
||
|
|
char base = refp[i];
|
||
|
|
if (base == 'N') {
|
||
|
|
if (add_md_char(s, decode_md, 'N',
|
||
|
|
&md_dist) < 0)
|
||
|
|
return -1;
|
||
|
|
nm++;
|
||
|
|
} else {
|
||
|
|
md_dist++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
memcpy(&seq[seq_pos-1], &s->ref[j], remainder);
|
||
|
|
}
|
||
|
|
ref_pos += cr->len - seq_pos + 1;
|
||
|
|
}
|
||
|
|
} else if (cr->ref_id >= 0) {
|
||
|
|
// So alignment end can be computed even when not decoding sequence
|
||
|
|
ref_pos += cr->len - seq_pos + 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ncigar+1 >= cigar_alloc) {
|
||
|
|
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
|
||
|
|
if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
|
||
|
|
return -1;
|
||
|
|
s->cigar = cigar;
|
||
|
|
}
|
||
|
|
#ifdef USE_X
|
||
|
|
if (cig_len && cig_op != BAM_CBASE_MATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
cig_op = BAM_CBASE_MATCH;
|
||
|
|
#else
|
||
|
|
if (cig_len && cig_op != BAM_CMATCH) {
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
cig_len = 0;
|
||
|
|
}
|
||
|
|
cig_op = BAM_CMATCH;
|
||
|
|
#endif
|
||
|
|
cig_len += cr->len - seq_pos+1;
|
||
|
|
}
|
||
|
|
|
||
|
|
skip_cigar:
|
||
|
|
|
||
|
|
if ((ds & CRAM_FN) && decode_md) {
|
||
|
|
if (md_dist >= 0)
|
||
|
|
BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (cig_len) {
|
||
|
|
if (ncigar >= cigar_alloc) {
|
||
|
|
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
|
||
|
|
if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
|
||
|
|
return -1;
|
||
|
|
s->cigar = cigar;
|
||
|
|
}
|
||
|
|
|
||
|
|
cigar[ncigar++] = (cig_len<<4) + cig_op;
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->ncigar = ncigar - cr->cigar;
|
||
|
|
cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos;
|
||
|
|
|
||
|
|
//printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
|
||
|
|
|
||
|
|
if (ds & CRAM_MQ) {
|
||
|
|
if (!codecs[DS_MQ]) return -1;
|
||
|
|
r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk,
|
||
|
|
(char *)&cr->mqual, &out_sz);
|
||
|
|
} else {
|
||
|
|
cr->mqual = 40;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
|
||
|
|
int32_t out_sz2 = cr->len;
|
||
|
|
|
||
|
|
if (!codecs[DS_QS]) return -1;
|
||
|
|
r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
|
||
|
|
qual, &out_sz2);
|
||
|
|
}
|
||
|
|
|
||
|
|
s->cigar = cigar;
|
||
|
|
s->cigar_alloc = cigar_alloc;
|
||
|
|
s->ncigar = ncigar;
|
||
|
|
|
||
|
|
if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
|
||
|
|
cr->len = 0;
|
||
|
|
|
||
|
|
if (decode_md) {
|
||
|
|
BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
|
||
|
|
size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux;
|
||
|
|
if (has_MD < 0) {
|
||
|
|
// has_MD < 0; already have MDZ allocated in aux at -has_MD,
|
||
|
|
// but wrote MD to end of aux (at orig_aux).
|
||
|
|
// We need some memmoves to shuffle it around.
|
||
|
|
char tmp_MD_[1024], *tmp_MD = tmp_MD_;
|
||
|
|
unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux;
|
||
|
|
if (sz > 1024) {
|
||
|
|
tmp_MD = malloc(sz);
|
||
|
|
if (!tmp_MD)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
memcpy(tmp_MD, orig_aux_p, sz);
|
||
|
|
memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz,
|
||
|
|
&BLOCK_DATA(s->aux_blk)[-has_MD],
|
||
|
|
orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]);
|
||
|
|
memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz);
|
||
|
|
if (tmp_MD != tmp_MD_)
|
||
|
|
free(tmp_MD);
|
||
|
|
|
||
|
|
if (-has_NM > -has_MD)
|
||
|
|
// we inserted before NM, so move it up a bit
|
||
|
|
has_NM -= sz;
|
||
|
|
}
|
||
|
|
// else has_MD == 0 and we've already appended MD to the end.
|
||
|
|
|
||
|
|
cr->aux_size += sz;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (decode_nm) {
|
||
|
|
if (has_NM == 0) {
|
||
|
|
char buf[7];
|
||
|
|
size_t buf_size;
|
||
|
|
buf[0] = 'N'; buf[1] = 'M';
|
||
|
|
if (nm <= UINT8_MAX) {
|
||
|
|
buf_size = 4;
|
||
|
|
buf[2] = 'C';
|
||
|
|
buf[3] = (nm>> 0) & 0xff;
|
||
|
|
} else if (nm <= UINT16_MAX) {
|
||
|
|
buf_size = 5;
|
||
|
|
buf[2] = 'S';
|
||
|
|
buf[3] = (nm>> 0) & 0xff;
|
||
|
|
buf[4] = (nm>> 8) & 0xff;
|
||
|
|
} else {
|
||
|
|
buf_size = 7;
|
||
|
|
buf[2] = 'I';
|
||
|
|
buf[3] = (nm>> 0) & 0xff;
|
||
|
|
buf[4] = (nm>> 8) & 0xff;
|
||
|
|
buf[5] = (nm>>16) & 0xff;
|
||
|
|
buf[6] = (nm>>24) & 0xff;
|
||
|
|
}
|
||
|
|
BLOCK_APPEND(s->aux_blk, buf, buf_size);
|
||
|
|
cr->aux_size += buf_size;
|
||
|
|
} else {
|
||
|
|
// Preallocated space for NM at -has_NM into aux block
|
||
|
|
unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM;
|
||
|
|
buf[0] = (nm>> 0) & 0xff;
|
||
|
|
buf[1] = (nm>> 8) & 0xff;
|
||
|
|
buf[2] = (nm>>16) & 0xff;
|
||
|
|
buf[3] = (nm>>24) & 0xff;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return r;
|
||
|
|
|
||
|
|
beyond_slice:
|
||
|
|
// Cramtools can create CRAMs that have sequence features outside the
|
||
|
|
// stated range of the container & slice reference extents (start + span).
|
||
|
|
// We have to check for these in many places, but for brevity have the
|
||
|
|
// error reporting in only one.
|
||
|
|
hts_log_error("CRAM CIGAR extends beyond slice reference extents");
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Quick and simple hash lookup for cram_map arrays
|
||
|
|
*/
|
||
|
|
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
|
||
|
|
cram_map *m;
|
||
|
|
|
||
|
|
m = map[CRAM_MAP(key[0],key[1])];
|
||
|
|
while (m && m->key != id)
|
||
|
|
m= m->next;
|
||
|
|
|
||
|
|
return m;
|
||
|
|
}
|
||
|
|
|
||
|
|
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
|
||
|
|
|
||
|
|
|
||
|
|
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
|
||
|
|
cram_block *blk, cram_record *cr) {
|
||
|
|
int i, r = 0, out_sz = 1;
|
||
|
|
unsigned char ntags;
|
||
|
|
|
||
|
|
if (!c->comp_hdr->codecs[DS_TC]) return -1;
|
||
|
|
r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
|
||
|
|
(char *)&ntags, &out_sz);
|
||
|
|
cr->ntags = ntags;
|
||
|
|
|
||
|
|
//printf("TC=%d\n", cr->ntags);
|
||
|
|
cr->aux_size = 0;
|
||
|
|
cr->aux = BLOCK_SIZE(s->aux_blk);
|
||
|
|
|
||
|
|
for (i = 0; i < cr->ntags; i++) {
|
||
|
|
int32_t id, out_sz = 1;
|
||
|
|
unsigned char tag_data[3];
|
||
|
|
cram_map *m;
|
||
|
|
|
||
|
|
//printf("Tag %d/%d\n", i+1, cr->ntags);
|
||
|
|
if (!c->comp_hdr->codecs[DS_TN]) return -1;
|
||
|
|
r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
|
||
|
|
blk, (char *)&id, &out_sz);
|
||
|
|
if (out_sz == 3) {
|
||
|
|
// Tag name stored as 3 chars instead of an int?
|
||
|
|
memcpy(tag_data, &id, 3);
|
||
|
|
} else {
|
||
|
|
tag_data[0] = (id>>16) & 0xff;
|
||
|
|
tag_data[1] = (id>>8) & 0xff;
|
||
|
|
tag_data[2] = id & 0xff;
|
||
|
|
}
|
||
|
|
|
||
|
|
m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
|
||
|
|
if (!m)
|
||
|
|
return -1;
|
||
|
|
BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
|
||
|
|
|
||
|
|
if (!m->codec) return -1;
|
||
|
|
r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
|
||
|
|
|
||
|
|
cr->aux_size += out_sz + 3;
|
||
|
|
}
|
||
|
|
|
||
|
|
return r;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// has_MD and has_NM are filled out with 0 for none present,
|
||
|
|
// 1 for present and verbatim, and -pos for present as placeholder
|
||
|
|
// (MD*, NM*) to be generated and filled out at offset +pos.
|
||
|
|
static int cram_decode_aux(cram_fd *fd,
|
||
|
|
cram_container *c, cram_slice *s,
|
||
|
|
cram_block *blk, cram_record *cr,
|
||
|
|
int *has_MD, int *has_NM) {
|
||
|
|
int i, r = 0, out_sz = 1;
|
||
|
|
int32_t TL = 0;
|
||
|
|
unsigned char *TN;
|
||
|
|
uint32_t ds = s->data_series;
|
||
|
|
|
||
|
|
if (!(ds & (CRAM_TL|CRAM_aux))) {
|
||
|
|
cr->aux = 0;
|
||
|
|
cr->aux_size = 0;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!c->comp_hdr->codecs[DS_TL]) return -1;
|
||
|
|
r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
|
||
|
|
(char *)&TL, &out_sz);
|
||
|
|
if (r || TL < 0 || TL >= c->comp_hdr->nTL)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
TN = c->comp_hdr->TL[TL];
|
||
|
|
cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
|
||
|
|
|
||
|
|
//printf("TC=%d\n", cr->ntags);
|
||
|
|
cr->aux_size = 0;
|
||
|
|
cr->aux = BLOCK_SIZE(s->aux_blk);
|
||
|
|
|
||
|
|
if (!(ds & CRAM_aux))
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
for (i = 0; i < cr->ntags; i++) {
|
||
|
|
int32_t id, out_sz = 1;
|
||
|
|
unsigned char tag_data[7];
|
||
|
|
cram_map *m;
|
||
|
|
|
||
|
|
if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
|
||
|
|
*has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);
|
||
|
|
if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
|
||
|
|
*has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);;
|
||
|
|
|
||
|
|
//printf("Tag %d/%d\n", i+1, cr->ntags);
|
||
|
|
tag_data[0] = TN[0];
|
||
|
|
tag_data[1] = TN[1];
|
||
|
|
tag_data[2] = TN[2];
|
||
|
|
id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') {
|
||
|
|
// Place holder, fill out contents later.
|
||
|
|
int tag_data_size;
|
||
|
|
if (TN[0] == 'N' && TN[1] == 'M') {
|
||
|
|
// Use a fixed size, so we can allocate room for it now.
|
||
|
|
memcpy(&tag_data[2], "I\0\0\0\0", 5);
|
||
|
|
tag_data_size = 7;
|
||
|
|
} else if (TN[0] == 'R' && TN[1] == 'G') {
|
||
|
|
// RG is variable size, but known already. Insert now
|
||
|
|
TN += 3;
|
||
|
|
// Equiv to fd->header->hrecs->rg[cr->rg], but this is the
|
||
|
|
// new header API equivalent.
|
||
|
|
const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg);
|
||
|
|
if (!rg)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
size_t rg_len = strlen(rg);
|
||
|
|
tag_data[2] = 'Z';
|
||
|
|
BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
|
||
|
|
BLOCK_APPEND(s->aux_blk, rg, rg_len);
|
||
|
|
BLOCK_APPEND_CHAR(s->aux_blk, '\0');
|
||
|
|
cr->aux_size += 3 + rg_len + 1;
|
||
|
|
cr->rg = -1; // prevents auto-add later
|
||
|
|
continue;
|
||
|
|
} else {
|
||
|
|
// Unknown size. We'll insert MD into stream later.
|
||
|
|
tag_data[2] = 'Z';
|
||
|
|
tag_data_size = 3;
|
||
|
|
}
|
||
|
|
BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size);
|
||
|
|
cr->aux_size += tag_data_size;
|
||
|
|
TN += 3;
|
||
|
|
} else {
|
||
|
|
TN += 3;
|
||
|
|
m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
|
||
|
|
if (!m)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
|
||
|
|
|
||
|
|
if (!m->codec) return -1;
|
||
|
|
r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
|
||
|
|
if (r) break;
|
||
|
|
cr->aux_size += out_sz + 3;
|
||
|
|
|
||
|
|
// cF CRAM flags.
|
||
|
|
if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
|
||
|
|
// Remove cF tag
|
||
|
|
uint8_t cF = BLOCK_END(s->aux_blk)[-1];
|
||
|
|
BLOCK_SIZE(s->aux_blk) -= out_sz+3;
|
||
|
|
cr->aux_size -= out_sz+3;
|
||
|
|
|
||
|
|
// bit 1 => don't auto-decode MD.
|
||
|
|
// Pretend MD is present verbatim, so we don't auto-generate
|
||
|
|
if ((cF & 1) && has_MD && *has_MD == 0)
|
||
|
|
*has_MD = 1;
|
||
|
|
|
||
|
|
// bit 1 => don't auto-decode NM
|
||
|
|
if ((cF & 2) && has_NM && *has_NM == 0)
|
||
|
|
*has_NM = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// We could go to 2^32 fine, but we shouldn't be hitting this anyway,
|
||
|
|
// and it's protecting against memory hogs too.
|
||
|
|
if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) {
|
||
|
|
hts_log_error("CRAM->BAM aux block size overflow");
|
||
|
|
goto block_err;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return r;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Resolve mate pair cross-references between recs within this slice */
|
||
|
|
static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
|
||
|
|
int rec;
|
||
|
|
|
||
|
|
if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
|
||
|
|
for (rec = 0; rec < s->hdr->num_records; rec++) {
|
||
|
|
cram_record *cr = &s->crecs[rec];
|
||
|
|
|
||
|
|
cr->tlen = 0;
|
||
|
|
cr->mate_pos = 0;
|
||
|
|
cr->mate_ref_id = -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
for (rec = 0; rec < s->hdr->num_records; rec++) {
|
||
|
|
cram_record *cr = &s->crecs[rec];
|
||
|
|
|
||
|
|
if (cr->mate_line >= 0) {
|
||
|
|
if (cr->mate_line < s->hdr->num_records) {
|
||
|
|
/*
|
||
|
|
* On the first read, loop through computing lengths.
|
||
|
|
* It's not perfect as we have one slice per reference so we
|
||
|
|
* cannot detect when TLEN should be zero due to seqs that
|
||
|
|
* map to multiple references.
|
||
|
|
*
|
||
|
|
* We also cannot set tlen correct when it spans a slice for
|
||
|
|
* other reasons. This may make tlen too small. Should we
|
||
|
|
* fix this by forcing TLEN to be stored verbatim in such cases?
|
||
|
|
*
|
||
|
|
* Or do we just admit defeat and output 0 for tlen? It's the
|
||
|
|
* safe option...
|
||
|
|
*/
|
||
|
|
if (cr->tlen == INT64_MIN) {
|
||
|
|
int id1 = rec, id2 = rec;
|
||
|
|
int64_t aleft = cr->apos, aright = cr->aend;
|
||
|
|
int64_t tlen;
|
||
|
|
int ref = cr->ref_id;
|
||
|
|
|
||
|
|
// number of segments starting at the same point.
|
||
|
|
int left_cnt = 0;
|
||
|
|
|
||
|
|
do {
|
||
|
|
if (aleft > s->crecs[id2].apos)
|
||
|
|
aleft = s->crecs[id2].apos, left_cnt = 1;
|
||
|
|
else if (aleft == s->crecs[id2].apos)
|
||
|
|
left_cnt++;
|
||
|
|
if (aright < s->crecs[id2].aend)
|
||
|
|
aright = s->crecs[id2].aend;
|
||
|
|
if (s->crecs[id2].mate_line == -1) {
|
||
|
|
s->crecs[id2].mate_line = rec;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (s->crecs[id2].mate_line <= id2 ||
|
||
|
|
s->crecs[id2].mate_line >= s->hdr->num_records)
|
||
|
|
return -1;
|
||
|
|
id2 = s->crecs[id2].mate_line;
|
||
|
|
|
||
|
|
if (s->crecs[id2].ref_id != ref)
|
||
|
|
ref = -1;
|
||
|
|
} while (id2 != id1);
|
||
|
|
|
||
|
|
if (ref != -1) {
|
||
|
|
tlen = aright - aleft + 1;
|
||
|
|
id1 = id2 = rec;
|
||
|
|
|
||
|
|
/*
|
||
|
|
* When we have two seqs with identical start and
|
||
|
|
* end coordinates, set +/- tlen based on 1st/last
|
||
|
|
* bit flags instead, as a tie breaker.
|
||
|
|
*/
|
||
|
|
if (s->crecs[id2].apos == aleft) {
|
||
|
|
if (left_cnt == 1 ||
|
||
|
|
(s->crecs[id2].flags & BAM_FREAD1))
|
||
|
|
s->crecs[id2].tlen = tlen;
|
||
|
|
else
|
||
|
|
s->crecs[id2].tlen = -tlen;
|
||
|
|
} else {
|
||
|
|
s->crecs[id2].tlen = -tlen;
|
||
|
|
}
|
||
|
|
|
||
|
|
id2 = s->crecs[id2].mate_line;
|
||
|
|
while (id2 != id1) {
|
||
|
|
if (s->crecs[id2].apos == aleft) {
|
||
|
|
if (left_cnt == 1 ||
|
||
|
|
(s->crecs[id2].flags & BAM_FREAD1))
|
||
|
|
s->crecs[id2].tlen = tlen;
|
||
|
|
else
|
||
|
|
s->crecs[id2].tlen = -tlen;
|
||
|
|
} else {
|
||
|
|
s->crecs[id2].tlen = -tlen;
|
||
|
|
}
|
||
|
|
id2 = s->crecs[id2].mate_line;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
id1 = id2 = rec;
|
||
|
|
|
||
|
|
s->crecs[id2].tlen = 0;
|
||
|
|
id2 = s->crecs[id2].mate_line;
|
||
|
|
while (id2 != id1) {
|
||
|
|
s->crecs[id2].tlen = 0;
|
||
|
|
id2 = s->crecs[id2].mate_line;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->mate_pos = s->crecs[cr->mate_line].apos;
|
||
|
|
cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
|
||
|
|
|
||
|
|
// paired
|
||
|
|
cr->flags |= BAM_FPAIRED;
|
||
|
|
|
||
|
|
// set mate unmapped if needed
|
||
|
|
if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
|
||
|
|
cr->flags |= BAM_FMUNMAP;
|
||
|
|
cr->tlen = 0;
|
||
|
|
}
|
||
|
|
if (cr->flags & BAM_FUNMAP) {
|
||
|
|
cr->tlen = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// set mate reversed if needed
|
||
|
|
if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
|
||
|
|
cr->flags |= BAM_FMREVERSE;
|
||
|
|
} else {
|
||
|
|
hts_log_error("Mate line out of bounds: %d vs [0, %d]",
|
||
|
|
cr->mate_line, s->hdr->num_records-1);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* FIXME: construct read names here too if needed */
|
||
|
|
} else {
|
||
|
|
if (cr->mate_flags & CRAM_M_REVERSE) {
|
||
|
|
cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
|
||
|
|
}
|
||
|
|
if (cr->mate_flags & CRAM_M_UNMAP) {
|
||
|
|
cr->flags |= BAM_FMUNMAP;
|
||
|
|
//cr->mate_ref_id = -1;
|
||
|
|
}
|
||
|
|
if (!(cr->flags & BAM_FPAIRED))
|
||
|
|
cr->mate_ref_id = -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (cr->tlen == INT64_MIN)
|
||
|
|
cr->tlen = 0; // Just incase
|
||
|
|
}
|
||
|
|
|
||
|
|
for (rec = 0; rec < s->hdr->num_records; rec++) {
|
||
|
|
cram_record *cr = &s->crecs[rec];
|
||
|
|
if (cr->explicit_tlen != INT64_MIN)
|
||
|
|
cr->tlen = cr->explicit_tlen;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static char *md5_print(unsigned char *md5, char *out) {
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < 16; i++) {
|
||
|
|
out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
|
||
|
|
out[i*2+1] = "0123456789abcdef"[md5[i]&15];
|
||
|
|
}
|
||
|
|
out[32] = 0;
|
||
|
|
|
||
|
|
return out;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Utility function to decode tlen (ISIZE), as it's called
|
||
|
|
* in multiple places.
|
||
|
|
*
|
||
|
|
* Returns codec return value (0 on success).
|
||
|
|
*/
|
||
|
|
static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s,
|
||
|
|
cram_block *blk, int64_t *tlen) {
|
||
|
|
int out_sz = 1, r = 0;
|
||
|
|
|
||
|
|
if (!c->comp_hdr->codecs[DS_TS]) return -1;
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) < 4) {
|
||
|
|
int32_t i32;
|
||
|
|
r |= c->comp_hdr->codecs[DS_TS]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_TS], blk,
|
||
|
|
(char *)&i32, &out_sz);
|
||
|
|
*tlen = i32;
|
||
|
|
} else {
|
||
|
|
r |= c->comp_hdr->codecs[DS_TS]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_TS], blk,
|
||
|
|
(char *)tlen, &out_sz);
|
||
|
|
}
|
||
|
|
return r;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Decode an entire slice from container blocks. Fills out s->crecs[] array.
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
|
||
|
|
sam_hdr_t *sh) {
|
||
|
|
cram_block *blk = s->block[0];
|
||
|
|
int32_t bf, ref_id;
|
||
|
|
unsigned char cf;
|
||
|
|
int out_sz, r = 0;
|
||
|
|
int rec;
|
||
|
|
char *seq = NULL, *qual = NULL;
|
||
|
|
int unknown_rg = -1;
|
||
|
|
int embed_ref;
|
||
|
|
char **refs = NULL;
|
||
|
|
uint32_t ds;
|
||
|
|
sam_hrecs_t *bfd = sh->hrecs;
|
||
|
|
|
||
|
|
if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
ds = s->data_series;
|
||
|
|
|
||
|
|
blk->bit = 7; // MSB first
|
||
|
|
|
||
|
|
// Study the blocks and estimate approx sizes to preallocate.
|
||
|
|
// This looks to speed up decoding by around 8-9%.
|
||
|
|
// We can always shrink back down at the end if we overestimated.
|
||
|
|
// However it's likely that this also saves memory as own growth
|
||
|
|
// factor (*=1.5) is never applied.
|
||
|
|
{
|
||
|
|
int qsize, nsize, q_id;
|
||
|
|
cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
|
||
|
|
//fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
|
||
|
|
|
||
|
|
if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
|
||
|
|
if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
|
||
|
|
if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
|
||
|
|
|
||
|
|
// To do - consider using q_id here to usurp the quality block and
|
||
|
|
// avoid a memcpy during decode.
|
||
|
|
// Specifically when quality is an external block uniquely used by
|
||
|
|
// DS_QS only, then we can set s->qual_blk directly to this
|
||
|
|
// block and save the codec->decode() calls. (Approx 3% cpu saving)
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Look for unknown RG, added as last by Java CRAM? */
|
||
|
|
if (bfd->nrg > 0 &&
|
||
|
|
bfd->rg[bfd->nrg-1].name != NULL &&
|
||
|
|
!strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
|
||
|
|
unknown_rg = bfd->nrg-1;
|
||
|
|
|
||
|
|
if (blk->content_type != CORE)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (s->crecs)
|
||
|
|
free(s->crecs);
|
||
|
|
if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
ref_id = s->hdr->ref_seq_id;
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) < 4)
|
||
|
|
embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
|
||
|
|
else
|
||
|
|
embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0;
|
||
|
|
|
||
|
|
if (ref_id >= 0) {
|
||
|
|
if (embed_ref) {
|
||
|
|
cram_block *b;
|
||
|
|
if (s->hdr->ref_base_id < 0) {
|
||
|
|
hts_log_error("No reference specified and no embedded reference is available"
|
||
|
|
" at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
|
||
|
|
s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
b = cram_get_block_by_id(s, s->hdr->ref_base_id);
|
||
|
|
if (!b)
|
||
|
|
return -1;
|
||
|
|
if (cram_uncompress_block(b) != 0)
|
||
|
|
return -1;
|
||
|
|
s->ref = (char *)BLOCK_DATA(b);
|
||
|
|
s->ref_start = s->hdr->ref_seq_start;
|
||
|
|
s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
|
||
|
|
if (s->hdr->ref_seq_span > b->uncomp_size) {
|
||
|
|
hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
|
||
|
|
ref_id, s->ref_start, s->ref_end);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
} else if (!c->comp_hdr->no_ref) {
|
||
|
|
//// Avoid Java cramtools bug by loading entire reference seq
|
||
|
|
//s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
|
||
|
|
//s->ref_start = 1;
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_SEQ) {
|
||
|
|
s->ref =
|
||
|
|
cram_get_ref(fd, s->hdr->ref_seq_id,
|
||
|
|
s->hdr->ref_seq_start,
|
||
|
|
s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
|
||
|
|
}
|
||
|
|
s->ref_start = s->hdr->ref_seq_start;
|
||
|
|
s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
|
||
|
|
|
||
|
|
/* Sanity check */
|
||
|
|
if (s->ref_start < 0) {
|
||
|
|
hts_log_warning("Slice starts before base 1"
|
||
|
|
" at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
|
||
|
|
s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
|
||
|
|
s->ref_start = 0;
|
||
|
|
}
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
pthread_mutex_lock(&fd->refs->lock);
|
||
|
|
if ((fd->required_fields & SAM_SEQ) &&
|
||
|
|
ref_id < fd->refs->nref && fd->refs->ref_id &&
|
||
|
|
s->ref_end > fd->refs->ref_id[ref_id]->length) {
|
||
|
|
s->ref_end = fd->refs->ref_id[ref_id]->length;
|
||
|
|
}
|
||
|
|
pthread_mutex_unlock(&fd->refs->lock);
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ((fd->required_fields & SAM_SEQ) &&
|
||
|
|
s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
|
||
|
|
hts_log_error("Unable to fetch reference #%d:%"PRId64"-%"PRId64"\n",
|
||
|
|
ref_id, s->hdr->ref_seq_start,
|
||
|
|
s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) != 1
|
||
|
|
&& (fd->required_fields & SAM_SEQ)
|
||
|
|
&& s->hdr->ref_seq_id >= 0
|
||
|
|
&& !fd->ignore_md5
|
||
|
|
&& memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
|
||
|
|
hts_md5_context *md5;
|
||
|
|
unsigned char digest[16];
|
||
|
|
|
||
|
|
if (s->ref && s->hdr->ref_seq_id >= 0) {
|
||
|
|
int start, len;
|
||
|
|
|
||
|
|
if (s->hdr->ref_seq_start >= s->ref_start) {
|
||
|
|
start = s->hdr->ref_seq_start - s->ref_start;
|
||
|
|
} else {
|
||
|
|
hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
|
||
|
|
ref_id, s->ref_start, s->ref_end);
|
||
|
|
start = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
|
||
|
|
len = s->hdr->ref_seq_span;
|
||
|
|
} else {
|
||
|
|
hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
|
||
|
|
ref_id, s->ref_start, s->ref_end);
|
||
|
|
len = s->ref_end - s->ref_start + 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(md5 = hts_md5_init()))
|
||
|
|
return -1;
|
||
|
|
if (start + len > s->ref_end - s->ref_start + 1)
|
||
|
|
len = s->ref_end - s->ref_start + 1 - start;
|
||
|
|
if (len >= 0)
|
||
|
|
hts_md5_update(md5, s->ref + start, len);
|
||
|
|
hts_md5_final(digest, md5);
|
||
|
|
hts_md5_destroy(md5);
|
||
|
|
} else if (!s->ref && s->hdr->ref_base_id >= 0) {
|
||
|
|
cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
|
||
|
|
if (b) {
|
||
|
|
if (!(md5 = hts_md5_init()))
|
||
|
|
return -1;
|
||
|
|
hts_md5_update(md5, b->data, b->uncomp_size);
|
||
|
|
hts_md5_final(digest, md5);
|
||
|
|
hts_md5_destroy(md5);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!c->comp_hdr->no_ref &&
|
||
|
|
((!s->ref && s->hdr->ref_base_id < 0)
|
||
|
|
|| memcmp(digest, s->hdr->md5, 16) != 0)) {
|
||
|
|
char M[33];
|
||
|
|
const char *rname = sam_hdr_tid2name(sh, ref_id);
|
||
|
|
if (!rname) rname="?"; // cannot happen normally
|
||
|
|
hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
|
||
|
|
rname, s->ref_start, s->ref_end);
|
||
|
|
hts_log_error("CRAM : %s", md5_print(s->hdr->md5, M));
|
||
|
|
hts_log_error("Ref : %s", md5_print(digest, M));
|
||
|
|
kstring_t ks = KS_INITIALIZE;
|
||
|
|
if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
|
||
|
|
hts_log_error("@SQ M5: %s", ks.s);
|
||
|
|
hts_log_error("Please check the reference given is correct");
|
||
|
|
ks_free(&ks);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ref_id == -2) {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
pthread_mutex_lock(&fd->refs->lock);
|
||
|
|
refs = calloc(fd->refs->nref, sizeof(char *));
|
||
|
|
pthread_mutex_unlock(&fd->refs->lock);
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
if (!refs)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
|
||
|
|
for (rec = 0; rec < s->hdr->num_records; rec++) {
|
||
|
|
cram_record *cr = &s->crecs[rec];
|
||
|
|
int has_MD, has_NM;
|
||
|
|
|
||
|
|
//fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
|
||
|
|
|
||
|
|
cr->s = s;
|
||
|
|
|
||
|
|
out_sz = 1; /* decode 1 item */
|
||
|
|
if (ds & CRAM_BF) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_BF]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_BF], blk,
|
||
|
|
(char *)&bf, &out_sz);
|
||
|
|
if (r || bf < 0 ||
|
||
|
|
bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
|
||
|
|
goto block_err;
|
||
|
|
bf = fd->bam_flag_swap[bf];
|
||
|
|
cr->flags = bf;
|
||
|
|
} else {
|
||
|
|
cr->flags = bf = 0x4; // unmapped
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_CF) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
/* CF is byte in 1.0, int32 in 2.0 */
|
||
|
|
if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_CF]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_CF], blk,
|
||
|
|
(char *)&cf, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
cr->cram_flags = cf;
|
||
|
|
} else {
|
||
|
|
if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_CF]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_CF], blk,
|
||
|
|
(char *)&cr->cram_flags, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
cf = cr->cram_flags;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
cf = cr->cram_flags = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
|
||
|
|
if (ds & CRAM_RI) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_RI]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_RI], blk,
|
||
|
|
(char *)&cr->ref_id, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
|
||
|
|
&& cr->ref_id >= 0
|
||
|
|
&& cr->ref_id != last_ref_id) {
|
||
|
|
if (!c->comp_hdr->no_ref) {
|
||
|
|
// Range(fd): seq >= 0, unmapped -1, unspecified -2
|
||
|
|
// Slice(s): seq >= 0, unmapped -1, multiple refs -2
|
||
|
|
// Record(cr): seq >= 0, unmapped -1
|
||
|
|
pthread_mutex_lock(&fd->range_lock);
|
||
|
|
int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid);
|
||
|
|
pthread_mutex_unlock(&fd->range_lock);
|
||
|
|
if (need_ref) {
|
||
|
|
if (!refs[cr->ref_id])
|
||
|
|
refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
|
||
|
|
if (!(s->ref = refs[cr->ref_id]))
|
||
|
|
goto block_err;
|
||
|
|
} else {
|
||
|
|
// For multi-ref containers, we don't need to fetch all
|
||
|
|
// refs if we're only querying one.
|
||
|
|
s->ref = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
pthread_mutex_lock(&fd->range_lock);
|
||
|
|
int discard_last_ref = (last_ref_id >= 0 &&
|
||
|
|
refs[last_ref_id] &&
|
||
|
|
(fd->range.refid == -2 ||
|
||
|
|
last_ref_id == fd->range.refid));
|
||
|
|
pthread_mutex_unlock(&fd->range_lock);
|
||
|
|
if (discard_last_ref) {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
discard_last_ref = !fd->unsorted;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
if (discard_last_ref) {
|
||
|
|
cram_ref_decr(fd->refs, last_ref_id);
|
||
|
|
refs[last_ref_id] = NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
s->ref_start = 1;
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
pthread_mutex_lock(&fd->refs->lock);
|
||
|
|
s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
|
||
|
|
pthread_mutex_unlock(&fd->refs->lock);
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
|
||
|
|
last_ref_id = cr->ref_id;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
cr->ref_id = -1;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
cr->ref_id = ref_id; // Forced constant in CRAM 1.0
|
||
|
|
}
|
||
|
|
if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) {
|
||
|
|
hts_log_error("Requested unknown reference ID %d", cr->ref_id);
|
||
|
|
goto block_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_RL) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_RL]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_RL], blk,
|
||
|
|
(char *)&cr->len, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
if (cr->len < 0) {
|
||
|
|
hts_log_error("Read has negative length");
|
||
|
|
goto block_err;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_AP) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
r |= c->comp_hdr->codecs[DS_AP]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_AP], blk,
|
||
|
|
(char *)&cr->apos, &out_sz);
|
||
|
|
} else {
|
||
|
|
int32_t i32;
|
||
|
|
r |= c->comp_hdr->codecs[DS_AP]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_AP], blk,
|
||
|
|
(char *)&i32, &out_sz);
|
||
|
|
cr->apos = i32;
|
||
|
|
}
|
||
|
|
if (r) goto block_err;;
|
||
|
|
if (c->comp_hdr->AP_delta) {
|
||
|
|
if (cr->apos < 0 && c->unsorted == 0) {
|
||
|
|
// cache locally in c->unsorted so we don't have an
|
||
|
|
// excessive number of locks
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
c->unsorted = fd->unsorted = 1;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
cr->apos += s->last_apos;
|
||
|
|
}
|
||
|
|
s->last_apos= cr->apos;
|
||
|
|
} else {
|
||
|
|
cr->apos = c->ref_seq_start;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_RG) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_RG]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_RG], blk,
|
||
|
|
(char *)&cr->rg, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
if (cr->rg == unknown_rg)
|
||
|
|
cr->rg = -1;
|
||
|
|
} else {
|
||
|
|
cr->rg = -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->name_len = 0;
|
||
|
|
|
||
|
|
if (c->comp_hdr->read_names_included) {
|
||
|
|
int32_t out_sz2 = 1;
|
||
|
|
|
||
|
|
// Read directly into name cram_block
|
||
|
|
cr->name = BLOCK_SIZE(s->name_blk);
|
||
|
|
if (ds & CRAM_RN) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_RN]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_RN], blk,
|
||
|
|
(char *)s->name_blk, &out_sz2);
|
||
|
|
if (r) goto block_err;
|
||
|
|
cr->name_len = out_sz2;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->mate_pos = 0;
|
||
|
|
cr->mate_line = -1;
|
||
|
|
cr->mate_ref_id = -1;
|
||
|
|
cr->explicit_tlen = INT64_MIN;
|
||
|
|
if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
|
||
|
|
if (ds & CRAM_MF) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
/* MF is byte in 1.0, int32 in 2.0 */
|
||
|
|
unsigned char mf;
|
||
|
|
if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_MF]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_MF],
|
||
|
|
blk, (char *)&mf, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
cr->mate_flags = mf;
|
||
|
|
} else {
|
||
|
|
if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_MF]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_MF],
|
||
|
|
blk,
|
||
|
|
(char *)&cr->mate_flags,
|
||
|
|
&out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
cr->mate_flags = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!c->comp_hdr->read_names_included) {
|
||
|
|
int32_t out_sz2 = 1;
|
||
|
|
|
||
|
|
// Read directly into name cram_block
|
||
|
|
cr->name = BLOCK_SIZE(s->name_blk);
|
||
|
|
if (ds & CRAM_RN) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_RN]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_RN],
|
||
|
|
blk, (char *)s->name_blk,
|
||
|
|
&out_sz2);
|
||
|
|
if (r) goto block_err;
|
||
|
|
cr->name_len = out_sz2;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_NS) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_NS]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_NS], blk,
|
||
|
|
(char *)&cr->mate_ref_id, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
|
||
|
|
// if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
|
||
|
|
// /* Paired, but unmapped */
|
||
|
|
// cr->flags |= BAM_FMUNMAP;
|
||
|
|
// }
|
||
|
|
|
||
|
|
if (ds & CRAM_NP) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_NP]) goto block_err;;
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) < 4) {
|
||
|
|
int32_t i32;
|
||
|
|
r |= c->comp_hdr->codecs[DS_NP]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_NP], blk,
|
||
|
|
(char *)&i32, &out_sz);
|
||
|
|
cr->mate_pos = i32;
|
||
|
|
} else {
|
||
|
|
r |= c->comp_hdr->codecs[DS_NP]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_NP], blk,
|
||
|
|
(char *)&cr->mate_pos, &out_sz);
|
||
|
|
}
|
||
|
|
if (r) goto block_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ds & CRAM_TS) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
|
||
|
|
r = cram_decode_tlen(fd, c, s, blk, &cr->tlen);
|
||
|
|
if (r) goto block_err;
|
||
|
|
} else {
|
||
|
|
cr->tlen = INT64_MIN;
|
||
|
|
}
|
||
|
|
} else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
|
||
|
|
// else not detached
|
||
|
|
if (ds & CRAM_NF) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_NF]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_NF], blk,
|
||
|
|
(char *)&cr->mate_line, &out_sz);
|
||
|
|
if (r) goto block_err;
|
||
|
|
cr->mate_line += rec + 1;
|
||
|
|
|
||
|
|
//cr->name_len = sprintf(name, "%d", name_id++);
|
||
|
|
//cr->name = DSTRING_LEN(name_ds);
|
||
|
|
//dstring_nappend(name_ds, name, cr->name_len);
|
||
|
|
|
||
|
|
cr->mate_ref_id = -1;
|
||
|
|
cr->tlen = INT64_MIN;
|
||
|
|
cr->mate_pos = 0;
|
||
|
|
} else {
|
||
|
|
cr->mate_flags = 0;
|
||
|
|
cr->tlen = INT64_MIN;
|
||
|
|
}
|
||
|
|
if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
|
||
|
|
if (ds & CRAM_TS) {
|
||
|
|
r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
|
||
|
|
if (r) return r;
|
||
|
|
} else {
|
||
|
|
cr->mate_flags = 0;
|
||
|
|
cr->tlen = INT64_MIN;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
|
||
|
|
if (ds & CRAM_TS) {
|
||
|
|
r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
|
||
|
|
if (r) return r;
|
||
|
|
} else {
|
||
|
|
cr->mate_flags = 0;
|
||
|
|
cr->tlen = INT64_MIN;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
cr->mate_flags = 0;
|
||
|
|
cr->tlen = INT64_MIN;
|
||
|
|
}
|
||
|
|
/*
|
||
|
|
else if (!name[0]) {
|
||
|
|
//name[0] = '?'; name[1] = 0;
|
||
|
|
//cr->name_len = 1;
|
||
|
|
//cr->name= DSTRING_LEN(s->name_ds);
|
||
|
|
//dstring_nappend(s->name_ds, "?", 1);
|
||
|
|
|
||
|
|
cr->mate_ref_id = -1;
|
||
|
|
cr->tlen = 0;
|
||
|
|
cr->mate_pos = 0;
|
||
|
|
}
|
||
|
|
*/
|
||
|
|
|
||
|
|
/* Auxiliary tags */
|
||
|
|
has_MD = has_NM = 0;
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1)
|
||
|
|
r |= cram_decode_aux_1_0(c, s, blk, cr);
|
||
|
|
else
|
||
|
|
r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM);
|
||
|
|
if (r) goto block_err;
|
||
|
|
|
||
|
|
/* Fake up dynamic string growth and appending */
|
||
|
|
if (ds & CRAM_RL) {
|
||
|
|
cr->seq = BLOCK_SIZE(s->seqs_blk);
|
||
|
|
BLOCK_GROW(s->seqs_blk, cr->len);
|
||
|
|
seq = (char *)BLOCK_END(s->seqs_blk);
|
||
|
|
BLOCK_SIZE(s->seqs_blk) += cr->len;
|
||
|
|
|
||
|
|
if (!seq)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
cr->qual = BLOCK_SIZE(s->qual_blk);
|
||
|
|
BLOCK_GROW(s->qual_blk, cr->len);
|
||
|
|
qual = (char *)BLOCK_END(s->qual_blk);
|
||
|
|
BLOCK_SIZE(s->qual_blk) += cr->len;
|
||
|
|
|
||
|
|
if (!s->ref)
|
||
|
|
memset(seq, '=', cr->len);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(bf & BAM_FUNMAP)) {
|
||
|
|
if ((ds & CRAM_AP) && cr->apos <= 0) {
|
||
|
|
hts_log_error("Read has alignment position %"PRId64
|
||
|
|
" but no unmapped flag",
|
||
|
|
cr->apos);
|
||
|
|
goto block_err;
|
||
|
|
}
|
||
|
|
/* Decode sequence and generate CIGAR */
|
||
|
|
if (ds & (CRAM_SEQ | CRAM_MQ)) {
|
||
|
|
r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual,
|
||
|
|
has_MD, has_NM);
|
||
|
|
if (r) goto block_err;
|
||
|
|
} else {
|
||
|
|
cr->cigar = 0;
|
||
|
|
cr->ncigar = 0;
|
||
|
|
cr->aend = cr->apos;
|
||
|
|
cr->mqual = 0;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
int out_sz2 = cr->len;
|
||
|
|
|
||
|
|
//puts("Unmapped");
|
||
|
|
cr->cigar = 0;
|
||
|
|
cr->ncigar = 0;
|
||
|
|
cr->aend = cr->apos;
|
||
|
|
cr->mqual = 0;
|
||
|
|
|
||
|
|
if (ds & CRAM_BA && cr->len) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_BA]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_BA], blk,
|
||
|
|
(char *)seq, &out_sz2);
|
||
|
|
if (r) goto block_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
|
||
|
|
out_sz2 = cr->len;
|
||
|
|
if (ds & CRAM_QS && cr->len >= 0) {
|
||
|
|
if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
|
||
|
|
r |= c->comp_hdr->codecs[DS_QS]
|
||
|
|
->decode(s, c->comp_hdr->codecs[DS_QS],
|
||
|
|
blk, qual, &out_sz2);
|
||
|
|
if (r) goto block_err;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
if (ds & CRAM_RL)
|
||
|
|
memset(qual, 255, cr->len);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) {
|
||
|
|
int i, j;
|
||
|
|
for (i = 0, j = cr->len-1; i < j; i++, j--) {
|
||
|
|
unsigned char c;
|
||
|
|
c = qual[i];
|
||
|
|
qual[i] = qual[j];
|
||
|
|
qual[j] = c;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
if (refs) {
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < fd->refs->nref; i++) {
|
||
|
|
if (refs[i])
|
||
|
|
cram_ref_decr(fd->refs, i);
|
||
|
|
}
|
||
|
|
free(refs);
|
||
|
|
refs = NULL;
|
||
|
|
} else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
|
||
|
|
cram_ref_decr(fd->refs, ref_id);
|
||
|
|
}
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
|
||
|
|
/* Resolve mate pair cross-references between recs within this slice */
|
||
|
|
r |= cram_decode_slice_xref(s, fd->required_fields);
|
||
|
|
|
||
|
|
// Free the original blocks as we no longer need these.
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < s->hdr->num_blocks; i++) {
|
||
|
|
cram_block *b = s->block[i];
|
||
|
|
cram_free_block(b);
|
||
|
|
s->block[i] = NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Also see initial BLOCK_RESIZE_EXACT at top of function.
|
||
|
|
// As we grow blocks we overallocate by up to 50%. So shrink
|
||
|
|
// back to their final sizes here.
|
||
|
|
//
|
||
|
|
//fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
|
||
|
|
// (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
|
||
|
|
// (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
|
||
|
|
// (int)s->name_blk->byte, (int)s->name_blk->alloc,
|
||
|
|
// (int)s->aux_blk->byte, (int)s->aux_blk->alloc);
|
||
|
|
BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
|
||
|
|
BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
|
||
|
|
BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
|
||
|
|
BLOCK_RESIZE_EXACT(s->aux_blk, BLOCK_SIZE(s->aux_blk)+1);
|
||
|
|
|
||
|
|
return r;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
if (refs) {
|
||
|
|
int i;
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
for (i = 0; i < fd->refs->nref; i++) {
|
||
|
|
if (refs[i])
|
||
|
|
cram_ref_decr(fd->refs, i);
|
||
|
|
}
|
||
|
|
free(refs);
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
typedef struct {
|
||
|
|
cram_fd *fd;
|
||
|
|
cram_container *c;
|
||
|
|
cram_slice *s;
|
||
|
|
sam_hdr_t *h;
|
||
|
|
int exit_code;
|
||
|
|
} cram_decode_job;
|
||
|
|
|
||
|
|
void *cram_decode_slice_thread(void *arg) {
|
||
|
|
cram_decode_job *j = (cram_decode_job *)arg;
|
||
|
|
|
||
|
|
j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
|
||
|
|
|
||
|
|
return j;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Spawn a multi-threaded version of cram_decode_slice().
|
||
|
|
*/
|
||
|
|
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
|
||
|
|
sam_hdr_t *bfd) {
|
||
|
|
cram_decode_job *j;
|
||
|
|
int nonblock;
|
||
|
|
|
||
|
|
if (!fd->pool)
|
||
|
|
return cram_decode_slice(fd, c, s, bfd);
|
||
|
|
|
||
|
|
if (!(j = malloc(sizeof(*j))))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
j->fd = fd;
|
||
|
|
j->c = c;
|
||
|
|
j->s = s;
|
||
|
|
j->h = bfd;
|
||
|
|
|
||
|
|
nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
|
||
|
|
|
||
|
|
int saved_errno = errno;
|
||
|
|
errno = 0;
|
||
|
|
if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
|
||
|
|
j, nonblock)) {
|
||
|
|
/* Would block */
|
||
|
|
if (errno != EAGAIN)
|
||
|
|
return -1;
|
||
|
|
fd->job_pending = j;
|
||
|
|
} else {
|
||
|
|
fd->job_pending = NULL;
|
||
|
|
}
|
||
|
|
errno = saved_errno;
|
||
|
|
|
||
|
|
// flush too
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------
|
||
|
|
* CRAM sequence iterators.
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Converts a cram in-memory record into a bam in-memory record. We
|
||
|
|
* pass a pointer to a bam_seq_t pointer along with the a pointer to
|
||
|
|
* the allocated size. These can initially be pointers to NULL and zero.
|
||
|
|
*
|
||
|
|
* This function will reallocate the bam buffer as required and update
|
||
|
|
* (*bam)->alloc accordingly, allowing it to be used within a loop
|
||
|
|
* efficiently without needing to allocate new bam objects over and
|
||
|
|
* over again.
|
||
|
|
*
|
||
|
|
* Returns the used size of the bam record on success
|
||
|
|
* -1 on failure.
|
||
|
|
*/
|
||
|
|
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
|
||
|
|
cram_record *cr, int rec, bam_seq_t **bam) {
|
||
|
|
int ret, rg_len;
|
||
|
|
char name_a[1024], *name;
|
||
|
|
int name_len;
|
||
|
|
char *aux;
|
||
|
|
char *seq, *qual;
|
||
|
|
sam_hrecs_t *bfd = sh->hrecs;
|
||
|
|
|
||
|
|
/* Assign names if not explicitly set */
|
||
|
|
if (fd->required_fields & SAM_QNAME) {
|
||
|
|
if (cr->name_len) {
|
||
|
|
name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
|
||
|
|
name_len = cr->name_len;
|
||
|
|
} else {
|
||
|
|
name = name_a;
|
||
|
|
if (cr->mate_line >= 0 && cr->mate_line < s->max_rec &&
|
||
|
|
s->crecs[cr->mate_line].name_len > 0) {
|
||
|
|
// Copy our mate if non-zero.
|
||
|
|
memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name,
|
||
|
|
s->crecs[cr->mate_line].name_len);
|
||
|
|
name = name_a + s->crecs[cr->mate_line].name_len;
|
||
|
|
} else {
|
||
|
|
// Otherwise generate a name based on prefix
|
||
|
|
name_len = strlen(fd->prefix);
|
||
|
|
memcpy(name, fd->prefix, name_len);
|
||
|
|
name += name_len;
|
||
|
|
*name++ = ':';
|
||
|
|
if (cr->mate_line >= 0 && cr->mate_line < rec) {
|
||
|
|
name = (char *)append_uint64((unsigned char *)name,
|
||
|
|
s->hdr->record_counter +
|
||
|
|
cr->mate_line + 1);
|
||
|
|
} else {
|
||
|
|
name = (char *)append_uint64((unsigned char *)name,
|
||
|
|
s->hdr->record_counter +
|
||
|
|
rec + 1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
name_len = name - name_a;
|
||
|
|
name = name_a;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
name = "?";
|
||
|
|
name_len = 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Generate BAM record */
|
||
|
|
if (cr->rg < -1 || cr->rg >= bfd->nrg)
|
||
|
|
return -1;
|
||
|
|
rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
|
||
|
|
|
||
|
|
if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
|
||
|
|
if (!BLOCK_DATA(s->seqs_blk))
|
||
|
|
return -1;
|
||
|
|
seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
|
||
|
|
} else {
|
||
|
|
seq = "*";
|
||
|
|
cr->len = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (fd->required_fields & SAM_QUAL) {
|
||
|
|
if (!BLOCK_DATA(s->qual_blk))
|
||
|
|
return -1;
|
||
|
|
qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
|
||
|
|
} else {
|
||
|
|
qual = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
ret = bam_set1(*bam,
|
||
|
|
name_len, name,
|
||
|
|
cr->flags, cr->ref_id, cr->apos - 1, cr->mqual,
|
||
|
|
cr->ncigar, &s->cigar[cr->cigar],
|
||
|
|
cr->mate_ref_id, cr->mate_pos - 1, cr->tlen,
|
||
|
|
cr->len, seq, qual,
|
||
|
|
cr->aux_size + rg_len);
|
||
|
|
if (ret < 0) {
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
aux = (char *)bam_aux(*bam);
|
||
|
|
|
||
|
|
/* Auxiliary strings */
|
||
|
|
if (cr->aux_size != 0) {
|
||
|
|
memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
|
||
|
|
aux += cr->aux_size;
|
||
|
|
(*bam)->l_data += cr->aux_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* RG:Z: */
|
||
|
|
if (rg_len > 0) {
|
||
|
|
*aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
|
||
|
|
int len = bfd->rg[cr->rg].name_len;
|
||
|
|
memcpy(aux, bfd->rg[cr->rg].name, len);
|
||
|
|
aux += len;
|
||
|
|
*aux++ = 0;
|
||
|
|
(*bam)->l_data += rg_len;
|
||
|
|
}
|
||
|
|
|
||
|
|
return (*bam)->l_data;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Here be dragons! The multi-threading code in this is crufty beyond belief.
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Load first container.
|
||
|
|
* Called when fd->ctr is NULL>
|
||
|
|
*
|
||
|
|
* Returns container on success
|
||
|
|
* NULL on failure.
|
||
|
|
*/
|
||
|
|
static cram_container *cram_first_slice(cram_fd *fd) {
|
||
|
|
cram_container *c;
|
||
|
|
|
||
|
|
do {
|
||
|
|
if (fd->ctr)
|
||
|
|
cram_free_container(fd->ctr);
|
||
|
|
|
||
|
|
if (!(c = fd->ctr = cram_read_container(fd)))
|
||
|
|
return NULL;
|
||
|
|
c->curr_slice_mt = c->curr_slice;
|
||
|
|
} while (c->length == 0);
|
||
|
|
|
||
|
|
/*
|
||
|
|
* The first container may be a result of a sub-range query.
|
||
|
|
* In which case it may still not be the optimal starting point
|
||
|
|
* due to skipped containers/slices in the index.
|
||
|
|
*/
|
||
|
|
// No need for locks here as we're in the main thread.
|
||
|
|
if (fd->range.refid != -2) {
|
||
|
|
while (c->ref_seq_id != -2 &&
|
||
|
|
(c->ref_seq_id < fd->range.refid ||
|
||
|
|
(fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
|
||
|
|
&& c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
|
||
|
|
if (0 != cram_seek(fd, c->length, SEEK_CUR))
|
||
|
|
return NULL;
|
||
|
|
cram_free_container(fd->ctr);
|
||
|
|
do {
|
||
|
|
if (!(c = fd->ctr = cram_read_container(fd)))
|
||
|
|
return NULL;
|
||
|
|
} while (c->length == 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
|
||
|
|
fd->eof = 1;
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(c->comp_hdr_block = cram_read_block(fd)))
|
||
|
|
return NULL;
|
||
|
|
if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
|
||
|
|
if (!c->comp_hdr)
|
||
|
|
return NULL;
|
||
|
|
if (!c->comp_hdr->AP_delta &&
|
||
|
|
sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
fd->unsorted = 1;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
|
||
|
|
return c;
|
||
|
|
}
|
||
|
|
|
||
|
|
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
|
||
|
|
cram_container *c_curr; // container being consumed via cram_get_seq()
|
||
|
|
cram_slice *s_curr = NULL;
|
||
|
|
|
||
|
|
// Populate the first container if unknown.
|
||
|
|
if (!(c_curr = fd->ctr)) {
|
||
|
|
if (!(c_curr = cram_first_slice(fd)))
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Discard previous slice
|
||
|
|
if ((s_curr = c_curr->slice)) {
|
||
|
|
c_curr->slice = NULL;
|
||
|
|
cram_free_slice(s_curr);
|
||
|
|
s_curr = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
// If we've consumed all slices in this container, also discard
|
||
|
|
// the container too.
|
||
|
|
if (c_curr->curr_slice == c_curr->max_slice) {
|
||
|
|
if (fd->ctr == c_curr)
|
||
|
|
fd->ctr = NULL;
|
||
|
|
if (fd->ctr_mt == c_curr)
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
cram_free_container(c_curr);
|
||
|
|
c_curr = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!fd->ctr_mt)
|
||
|
|
fd->ctr_mt = c_curr;
|
||
|
|
|
||
|
|
// Fetch the next slice (and the container if necessary).
|
||
|
|
//
|
||
|
|
// If single threaded this loop bails out as soon as it finds
|
||
|
|
// a slice in range. In this case c_next and c_curr end up being
|
||
|
|
// the same thing.
|
||
|
|
//
|
||
|
|
// If multi-threaded, we loop until we have filled out
|
||
|
|
// thread pool input queue. Here c_next and c_curr *may* differ, as
|
||
|
|
// can fd->ctr and fd->ctr_mt.
|
||
|
|
for (;;) {
|
||
|
|
cram_container *c_next = fd->ctr_mt;
|
||
|
|
cram_slice *s_next = NULL;
|
||
|
|
|
||
|
|
// Next slice; either from the last job we failed to push
|
||
|
|
// to the input queue or via more I/O.
|
||
|
|
if (fd->job_pending) {
|
||
|
|
cram_decode_job *j = (cram_decode_job *)fd->job_pending;
|
||
|
|
c_next = j->c;
|
||
|
|
s_next = j->s;
|
||
|
|
free(fd->job_pending);
|
||
|
|
fd->job_pending = NULL;
|
||
|
|
} else if (!fd->ooc) {
|
||
|
|
empty_container:
|
||
|
|
if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
|
||
|
|
// new container
|
||
|
|
for(;;) {
|
||
|
|
if (!(c_next = cram_read_container(fd))) {
|
||
|
|
if (fd->pool) {
|
||
|
|
fd->ooc = 1;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
c_next->curr_slice_mt = c_next->curr_slice;
|
||
|
|
|
||
|
|
if (c_next->length != 0)
|
||
|
|
break;
|
||
|
|
|
||
|
|
cram_free_container(c_next);
|
||
|
|
}
|
||
|
|
if (fd->ooc)
|
||
|
|
break;
|
||
|
|
|
||
|
|
/* Skip containers not yet spanning our range */
|
||
|
|
if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
|
||
|
|
// ref_id beyond end of range; bail out
|
||
|
|
if (c_next->ref_seq_id != fd->range.refid) {
|
||
|
|
cram_free_container(c_next);
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
fd->ooc = 1;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
// position beyond end of range; bail out
|
||
|
|
if (fd->range.refid != -1 &&
|
||
|
|
c_next->ref_seq_start > fd->range.end) {
|
||
|
|
cram_free_container(c_next);
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
fd->ooc = 1;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
// before start of range; skip to next container
|
||
|
|
if (fd->range.refid != -1 &&
|
||
|
|
c_next->ref_seq_start + c_next->ref_seq_span-1 <
|
||
|
|
fd->range.start) {
|
||
|
|
c_next->curr_slice_mt = c_next->max_slice;
|
||
|
|
cram_seek(fd, c_next->length, SEEK_CUR);
|
||
|
|
cram_free_container(c_next);
|
||
|
|
c_next = NULL;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Container is valid range, so remember it for restarting
|
||
|
|
// this function.
|
||
|
|
fd->ctr_mt = c_next;
|
||
|
|
|
||
|
|
if (!(c_next->comp_hdr_block = cram_read_block(fd)))
|
||
|
|
return NULL;
|
||
|
|
if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
c_next->comp_hdr =
|
||
|
|
cram_decode_compression_header(fd, c_next->comp_hdr_block);
|
||
|
|
if (!c_next->comp_hdr)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
if (!c_next->comp_hdr->AP_delta &&
|
||
|
|
sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
fd->unsorted = 1;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (c_next->num_records == 0) {
|
||
|
|
if (fd->ctr == c_next)
|
||
|
|
fd->ctr = NULL;
|
||
|
|
if (c_curr == c_next)
|
||
|
|
c_curr = NULL;
|
||
|
|
if (fd->ctr_mt == c_next)
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
cram_free_container(c_next);
|
||
|
|
c_next = NULL;
|
||
|
|
goto empty_container;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!(s_next = c_next->slice = cram_read_slice(fd)))
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
s_next->slice_num = ++c_next->curr_slice_mt;
|
||
|
|
s_next->curr_rec = 0;
|
||
|
|
s_next->max_rec = s_next->hdr->num_records;
|
||
|
|
|
||
|
|
s_next->last_apos = s_next->hdr->ref_seq_start;
|
||
|
|
|
||
|
|
// We know the container overlaps our range, but with multi-slice
|
||
|
|
// containers we may have slices that do not. Skip these also.
|
||
|
|
if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
|
||
|
|
// ref_id beyond end of range; bail out
|
||
|
|
if (s_next->hdr->ref_seq_id != fd->range.refid) {
|
||
|
|
fd->ooc = 1;
|
||
|
|
cram_free_slice(s_next);
|
||
|
|
c_next->slice = s_next = NULL;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
// position beyond end of range; bail out
|
||
|
|
if (fd->range.refid != -1 &&
|
||
|
|
s_next->hdr->ref_seq_start > fd->range.end) {
|
||
|
|
fd->ooc = 1;
|
||
|
|
cram_free_slice(s_next);
|
||
|
|
c_next->slice = s_next = NULL;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
// before start of range; skip to next slice
|
||
|
|
if (fd->range.refid != -1 &&
|
||
|
|
s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
|
||
|
|
fd->range.start) {
|
||
|
|
cram_free_slice(s_next);
|
||
|
|
c_next->slice = s_next = NULL;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} // end: if (!fd->ooc)
|
||
|
|
|
||
|
|
if (!c_next || !s_next)
|
||
|
|
break;
|
||
|
|
|
||
|
|
// Decode the slice, either right now (non-threaded) or by pushing
|
||
|
|
// it to the a decode queue (threaded).
|
||
|
|
if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
|
||
|
|
hts_log_error("Failure to decode slice");
|
||
|
|
cram_free_slice(s_next);
|
||
|
|
c_next->slice = NULL;
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
// No thread pool, so don't loop again
|
||
|
|
if (!fd->pool) {
|
||
|
|
c_curr = c_next;
|
||
|
|
s_curr = s_next;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
// With thread pool, but we have a job pending so our decode queue
|
||
|
|
// is full.
|
||
|
|
if (fd->job_pending)
|
||
|
|
break;
|
||
|
|
|
||
|
|
// Otherwise we're threaded with room in the decode input queue, so
|
||
|
|
// keep reading slices for decode.
|
||
|
|
// Push it a bit far, to qsize in queue rather than pending arrival,
|
||
|
|
// as cram tends to be a bit bursty in decode timings.
|
||
|
|
if (hts_tpool_process_len(fd->rqueue) >
|
||
|
|
hts_tpool_process_qsize(fd->rqueue))
|
||
|
|
break;
|
||
|
|
} // end of for(;;)
|
||
|
|
|
||
|
|
|
||
|
|
// When not threaded we've already have c_curr and s_curr.
|
||
|
|
// Otherwise we need get them by pulling off the decode output queue.
|
||
|
|
if (fd->pool) {
|
||
|
|
hts_tpool_result *res;
|
||
|
|
cram_decode_job *j;
|
||
|
|
|
||
|
|
if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) {
|
||
|
|
fd->eof = 1;
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
res = hts_tpool_next_result_wait(fd->rqueue);
|
||
|
|
|
||
|
|
if (!res || !hts_tpool_result_data(res)) {
|
||
|
|
hts_log_error("Call to hts_tpool_next_result failed");
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
j = (cram_decode_job *)hts_tpool_result_data(res);
|
||
|
|
c_curr = j->c;
|
||
|
|
s_curr = j->s;
|
||
|
|
|
||
|
|
if (j->exit_code != 0) {
|
||
|
|
hts_log_error("Slice decode failure");
|
||
|
|
fd->eof = 0;
|
||
|
|
hts_tpool_delete_result(res, 1);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
hts_tpool_delete_result(res, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
*cp = c_curr;
|
||
|
|
|
||
|
|
// Update current slice being processed (as opposed to current
|
||
|
|
// slice in the multi-threaded reahead.
|
||
|
|
fd->ctr = c_curr;
|
||
|
|
if (c_curr) {
|
||
|
|
c_curr->slice = s_curr;
|
||
|
|
if (s_curr)
|
||
|
|
c_curr->curr_slice = s_curr->slice_num;
|
||
|
|
}
|
||
|
|
if (s_curr)
|
||
|
|
s_curr->curr_rec = 0;
|
||
|
|
else
|
||
|
|
fd->eof = 1;
|
||
|
|
|
||
|
|
return s_curr;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Read the next cram record and return it.
|
||
|
|
* Note that to decode cram_record the caller will need to look up some data
|
||
|
|
* in the current slice, pointed to by fd->ctr->slice. This is valid until
|
||
|
|
* the next call to cram_get_seq (which may invalidate it).
|
||
|
|
*
|
||
|
|
* Returns record pointer on success (do not free)
|
||
|
|
* NULL on failure
|
||
|
|
*/
|
||
|
|
cram_record *cram_get_seq(cram_fd *fd) {
|
||
|
|
cram_container *c;
|
||
|
|
cram_slice *s;
|
||
|
|
|
||
|
|
for (;;) {
|
||
|
|
c = fd->ctr;
|
||
|
|
if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
|
||
|
|
s = c->slice;
|
||
|
|
} else {
|
||
|
|
if (!(s = cram_next_slice(fd, &c)))
|
||
|
|
return NULL;
|
||
|
|
continue; /* In case slice contains no records */
|
||
|
|
}
|
||
|
|
|
||
|
|
// No need to lock here as get_seq is running in the main thread,
|
||
|
|
// which is also the same one that does the range modifications.
|
||
|
|
if (fd->range.refid != -2) {
|
||
|
|
if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
|
||
|
|
// Special case when looking for unmapped blocks at end.
|
||
|
|
// If these are mixed in with mapped data (c->ref_id == -2)
|
||
|
|
// then we need skip until we find the unmapped data, if at all
|
||
|
|
s->curr_rec++;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
|
||
|
|
s->crecs[s->curr_rec].ref_id != -1) {
|
||
|
|
// Looking for a mapped read, but not there yet. Special case
|
||
|
|
// as -1 (unmapped) shouldn't be considered < refid.
|
||
|
|
s->curr_rec++;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
|
||
|
|
fd->eof = 1;
|
||
|
|
cram_free_slice(s);
|
||
|
|
c->slice = NULL;
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
|
||
|
|
fd->eof = 1;
|
||
|
|
cram_free_slice(s);
|
||
|
|
c->slice = NULL;
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
|
||
|
|
s->curr_rec++;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
fd->ctr = c;
|
||
|
|
c->slice = s;
|
||
|
|
return &s->crecs[s->curr_rec++];
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Read the next cram record and convert it to a bam_seq_t struct.
|
||
|
|
*
|
||
|
|
* Returns >= 0 success (number of bytes written to *bam)
|
||
|
|
* -1 on EOF or failure (check fd->err)
|
||
|
|
*/
|
||
|
|
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
|
||
|
|
cram_record *cr;
|
||
|
|
cram_container *c;
|
||
|
|
cram_slice *s;
|
||
|
|
|
||
|
|
if (!(cr = cram_get_seq(fd)))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
c = fd->ctr;
|
||
|
|
s = c->slice;
|
||
|
|
|
||
|
|
return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Drains and frees the decode read-queue for a multi-threaded reader.
|
||
|
|
*/
|
||
|
|
void cram_drain_rqueue(cram_fd *fd) {
|
||
|
|
cram_container *lc = NULL;
|
||
|
|
|
||
|
|
if (!fd->pool || !fd->rqueue)
|
||
|
|
return;
|
||
|
|
|
||
|
|
// drain queue of any in-flight decode jobs
|
||
|
|
while (!hts_tpool_process_empty(fd->rqueue)) {
|
||
|
|
hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue);
|
||
|
|
if (!r)
|
||
|
|
break;
|
||
|
|
cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r);
|
||
|
|
if (j->c->slice == j->s)
|
||
|
|
j->c->slice = NULL;
|
||
|
|
if (j->c != lc) {
|
||
|
|
if (lc) {
|
||
|
|
if (fd->ctr == lc)
|
||
|
|
fd->ctr = NULL;
|
||
|
|
if (fd->ctr_mt == lc)
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
cram_free_container(lc);
|
||
|
|
}
|
||
|
|
lc = j->c;
|
||
|
|
}
|
||
|
|
cram_free_slice(j->s);
|
||
|
|
hts_tpool_delete_result(r, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Also tidy up any pending decode job that we didn't submit to the workers
|
||
|
|
// due to the input queue being full.
|
||
|
|
if (fd->job_pending) {
|
||
|
|
cram_decode_job *j = (cram_decode_job *)fd->job_pending;
|
||
|
|
if (j->c->slice == j->s)
|
||
|
|
j->c->slice = NULL;
|
||
|
|
if (j->c != lc) {
|
||
|
|
if (lc) {
|
||
|
|
if (fd->ctr == lc)
|
||
|
|
fd->ctr = NULL;
|
||
|
|
if (fd->ctr_mt == lc)
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
cram_free_container(lc);
|
||
|
|
}
|
||
|
|
lc = j->c;
|
||
|
|
}
|
||
|
|
cram_free_slice(j->s);
|
||
|
|
free(j);
|
||
|
|
fd->job_pending = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (lc) {
|
||
|
|
if (fd->ctr == lc)
|
||
|
|
fd->ctr = NULL;
|
||
|
|
if (fd->ctr_mt == lc)
|
||
|
|
fd->ctr_mt = NULL;
|
||
|
|
cram_free_container(lc);
|
||
|
|
}
|
||
|
|
}
|