4181 lines
146 KiB
C
4181 lines
146 KiB
C
|
|
/*
|
||
|
|
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
|
||
|
|
Author: James Bonfield <jkb@sanger.ac.uk>
|
||
|
|
|
||
|
|
Redistribution and use in source and binary forms, with or without
|
||
|
|
modification, are permitted provided that the following conditions are met:
|
||
|
|
|
||
|
|
1. Redistributions of source code must retain the above copyright notice,
|
||
|
|
this list of conditions and the following disclaimer.
|
||
|
|
|
||
|
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
||
|
|
this list of conditions and the following disclaimer in the documentation
|
||
|
|
and/or other materials provided with the distribution.
|
||
|
|
|
||
|
|
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
||
|
|
Institute nor the names of its contributors may be used to endorse or promote
|
||
|
|
products derived from this software without specific prior written permission.
|
||
|
|
|
||
|
|
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
|
||
|
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
|
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
|
||
|
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
|
*/
|
||
|
|
|
||
|
|
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
|
||
|
|
#include <config.h>
|
||
|
|
|
||
|
|
#include <stdio.h>
|
||
|
|
#include <errno.h>
|
||
|
|
#include <assert.h>
|
||
|
|
#include <stdlib.h>
|
||
|
|
#include <string.h>
|
||
|
|
#include <strings.h>
|
||
|
|
#include <zlib.h>
|
||
|
|
#include <sys/types.h>
|
||
|
|
#include <sys/stat.h>
|
||
|
|
#include <math.h>
|
||
|
|
#include <inttypes.h>
|
||
|
|
#include <ctype.h>
|
||
|
|
|
||
|
|
#include "cram.h"
|
||
|
|
#include "os.h"
|
||
|
|
#include "../sam_internal.h" // for nibble2base
|
||
|
|
#include "../htslib/hts.h"
|
||
|
|
#include "../htslib/hts_endian.h"
|
||
|
|
#include "../textutils_internal.h"
|
||
|
|
|
||
|
|
KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
|
||
|
|
|
||
|
|
#define Z_CRAM_STRAT Z_FILTERED
|
||
|
|
//#define Z_CRAM_STRAT Z_RLE
|
||
|
|
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
|
||
|
|
//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
|
||
|
|
|
||
|
|
static int process_one_read(cram_fd *fd, cram_container *c,
|
||
|
|
cram_slice *s, cram_record *cr,
|
||
|
|
bam_seq_t *b, int rnum, kstring_t *MD,
|
||
|
|
int embed_ref, int no_ref);
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Returns index of val into key.
|
||
|
|
* Basically strchr(key, val)-key;
|
||
|
|
*/
|
||
|
|
static int sub_idx(char *key, char val) {
|
||
|
|
int i;
|
||
|
|
|
||
|
|
for (i = 0; i < 4 && *key++ != val; i++);
|
||
|
|
return i;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Encodes a compression header block into a generic cram_block structure.
|
||
|
|
*
|
||
|
|
* Returns cram_block ptr on success
|
||
|
|
* NULL on failure
|
||
|
|
*/
|
||
|
|
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
|
||
|
|
cram_block_compression_hdr *h,
|
||
|
|
int embed_ref) {
|
||
|
|
cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0);
|
||
|
|
cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
|
||
|
|
int i, mc, r = 0;
|
||
|
|
|
||
|
|
int no_ref = c->no_ref;
|
||
|
|
|
||
|
|
if (!cb || !map)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
/*
|
||
|
|
* This is a concatenation of several blocks of data:
|
||
|
|
* header + landmarks, preservation map, read encoding map, and the tag
|
||
|
|
* encoding map.
|
||
|
|
* All 4 are variable sized and we need to know how large these are
|
||
|
|
* before creating the compression header itself as this starts with
|
||
|
|
* the total size (stored as a variable length string).
|
||
|
|
*/
|
||
|
|
|
||
|
|
// Duplicated from container itself, and removed in 1.1
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
r |= itf8_put_blk(cb, h->ref_seq_id);
|
||
|
|
r |= itf8_put_blk(cb, h->ref_seq_start);
|
||
|
|
r |= itf8_put_blk(cb, h->ref_seq_span);
|
||
|
|
r |= itf8_put_blk(cb, h->num_records);
|
||
|
|
r |= itf8_put_blk(cb, h->num_landmarks);
|
||
|
|
for (i = 0; i < h->num_landmarks; i++) {
|
||
|
|
r |= itf8_put_blk(cb, h->landmark[i]);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (h->preservation_map) {
|
||
|
|
kh_destroy(map, h->preservation_map);
|
||
|
|
h->preservation_map = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Create in-memory preservation map */
|
||
|
|
/* FIXME: should create this when we create the container */
|
||
|
|
if (c->num_records > 0) {
|
||
|
|
khint_t k;
|
||
|
|
int r;
|
||
|
|
|
||
|
|
if (!(h->preservation_map = kh_init(map)))
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
k = kh_put(map, h->preservation_map, "RN", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
k = kh_put(map, h->preservation_map, "PI", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = 0;
|
||
|
|
|
||
|
|
k = kh_put(map, h->preservation_map, "UI", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = 1;
|
||
|
|
|
||
|
|
k = kh_put(map, h->preservation_map, "MI", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = 1;
|
||
|
|
|
||
|
|
} else {
|
||
|
|
// Technically SM was in 1.0, but wasn't in Java impl.
|
||
|
|
k = kh_put(map, h->preservation_map, "SM", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = 0;
|
||
|
|
|
||
|
|
k = kh_put(map, h->preservation_map, "TD", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = 0;
|
||
|
|
|
||
|
|
k = kh_put(map, h->preservation_map, "AP", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = h->AP_delta;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
k = kh_put(map, h->preservation_map, "QO", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = h->qs_seq_orient;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (no_ref || embed_ref>0) {
|
||
|
|
// Reference Required == No
|
||
|
|
k = kh_put(map, h->preservation_map, "RR", &r);
|
||
|
|
if (-1 == r) return NULL;
|
||
|
|
kh_val(h->preservation_map, k).i = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Encode preservation map; could collapse this and above into one */
|
||
|
|
mc = 0;
|
||
|
|
BLOCK_SIZE(map) = 0;
|
||
|
|
if (h->preservation_map) {
|
||
|
|
khint_t k;
|
||
|
|
|
||
|
|
for (k = kh_begin(h->preservation_map);
|
||
|
|
k != kh_end(h->preservation_map);
|
||
|
|
k++) {
|
||
|
|
const char *key;
|
||
|
|
khash_t(map) *pmap = h->preservation_map;
|
||
|
|
|
||
|
|
|
||
|
|
if (!kh_exist(pmap, k))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
key = kh_key(pmap, k);
|
||
|
|
BLOCK_APPEND(map, key, 2);
|
||
|
|
|
||
|
|
switch(CRAM_KEY(key[0], key[1])) {
|
||
|
|
case CRAM_KEY('M','I'):
|
||
|
|
case CRAM_KEY('U','I'):
|
||
|
|
case CRAM_KEY('P','I'):
|
||
|
|
case CRAM_KEY('A','P'):
|
||
|
|
case CRAM_KEY('R','N'):
|
||
|
|
case CRAM_KEY('R','R'):
|
||
|
|
case CRAM_KEY('Q','O'):
|
||
|
|
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case CRAM_KEY('S','M'): {
|
||
|
|
char smat[5], *mp = smat;
|
||
|
|
// Output format is for order ACGTN (minus ref base)
|
||
|
|
// to store the code value 0-3 for each symbol.
|
||
|
|
//
|
||
|
|
// Note this is different to storing the symbols in order
|
||
|
|
// that the codes occur from 0-3, which is what we used to
|
||
|
|
// do. (It didn't matter as we always had a fixed table in
|
||
|
|
// the order.)
|
||
|
|
*mp++ =
|
||
|
|
(sub_idx(h->substitution_matrix[0], 'C') << 6) |
|
||
|
|
(sub_idx(h->substitution_matrix[0], 'G') << 4) |
|
||
|
|
(sub_idx(h->substitution_matrix[0], 'T') << 2) |
|
||
|
|
(sub_idx(h->substitution_matrix[0], 'N') << 0);
|
||
|
|
*mp++ =
|
||
|
|
(sub_idx(h->substitution_matrix[1], 'A') << 6) |
|
||
|
|
(sub_idx(h->substitution_matrix[1], 'G') << 4) |
|
||
|
|
(sub_idx(h->substitution_matrix[1], 'T') << 2) |
|
||
|
|
(sub_idx(h->substitution_matrix[1], 'N') << 0);
|
||
|
|
*mp++ =
|
||
|
|
(sub_idx(h->substitution_matrix[2], 'A') << 6) |
|
||
|
|
(sub_idx(h->substitution_matrix[2], 'C') << 4) |
|
||
|
|
(sub_idx(h->substitution_matrix[2], 'T') << 2) |
|
||
|
|
(sub_idx(h->substitution_matrix[2], 'N') << 0);
|
||
|
|
*mp++ =
|
||
|
|
(sub_idx(h->substitution_matrix[3], 'A') << 6) |
|
||
|
|
(sub_idx(h->substitution_matrix[3], 'C') << 4) |
|
||
|
|
(sub_idx(h->substitution_matrix[3], 'G') << 2) |
|
||
|
|
(sub_idx(h->substitution_matrix[3], 'N') << 0);
|
||
|
|
*mp++ =
|
||
|
|
(sub_idx(h->substitution_matrix[4], 'A') << 6) |
|
||
|
|
(sub_idx(h->substitution_matrix[4], 'C') << 4) |
|
||
|
|
(sub_idx(h->substitution_matrix[4], 'G') << 2) |
|
||
|
|
(sub_idx(h->substitution_matrix[4], 'T') << 0);
|
||
|
|
BLOCK_APPEND(map, smat, 5);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case CRAM_KEY('T','D'): {
|
||
|
|
r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
|
||
|
|
BLOCK_APPEND(map,
|
||
|
|
BLOCK_DATA(h->TD_blk),
|
||
|
|
BLOCK_SIZE(h->TD_blk));
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
default:
|
||
|
|
hts_log_warning("Unknown preservation key '%.2s'", key);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
|
||
|
|
r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
|
||
|
|
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
|
||
|
|
|
||
|
|
/* rec encoding map */
|
||
|
|
mc = 0;
|
||
|
|
BLOCK_SIZE(map) = 0;
|
||
|
|
if (h->codecs[DS_BF]) {
|
||
|
|
if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_CF]) {
|
||
|
|
if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_RL]) {
|
||
|
|
if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_AP]) {
|
||
|
|
if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_RG]) {
|
||
|
|
if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_MF]) {
|
||
|
|
if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_NS]) {
|
||
|
|
if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_NP]) {
|
||
|
|
if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_TS]) {
|
||
|
|
if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_NF]) {
|
||
|
|
if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_TC]) {
|
||
|
|
if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_TN]) {
|
||
|
|
if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_TL]) {
|
||
|
|
if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_FN]) {
|
||
|
|
if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_FC]) {
|
||
|
|
if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_FP]) {
|
||
|
|
if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_BS]) {
|
||
|
|
if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_IN]) {
|
||
|
|
if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_DL]) {
|
||
|
|
if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_BA]) {
|
||
|
|
if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_BB]) {
|
||
|
|
if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_MQ]) {
|
||
|
|
if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_RN]) {
|
||
|
|
if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_QS]) {
|
||
|
|
if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_QQ]) {
|
||
|
|
if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_RI]) {
|
||
|
|
if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) != 1) {
|
||
|
|
if (h->codecs[DS_SC]) {
|
||
|
|
if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_RS]) {
|
||
|
|
if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_PD]) {
|
||
|
|
if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_HC]) {
|
||
|
|
if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_TM]) {
|
||
|
|
if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
if (h->codecs[DS_TV]) {
|
||
|
|
if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
|
||
|
|
fd->version))
|
||
|
|
return NULL;
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
|
||
|
|
r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
|
||
|
|
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
|
||
|
|
|
||
|
|
/* tag encoding map */
|
||
|
|
mc = 0;
|
||
|
|
BLOCK_SIZE(map) = 0;
|
||
|
|
if (c->tags_used) {
|
||
|
|
khint_t k;
|
||
|
|
|
||
|
|
for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
|
||
|
|
int key;
|
||
|
|
if (!kh_exist(c->tags_used, k))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
key = kh_key(c->tags_used, k);
|
||
|
|
cram_codec *cd = kh_val(c->tags_used, k)->codec;
|
||
|
|
|
||
|
|
r |= (fd->vv.varint_put32_blk(map, key) <= 0);
|
||
|
|
if (-1 == cd->store(cd, map, NULL, fd->version))
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
mc++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
|
||
|
|
r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
|
||
|
|
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
|
||
|
|
|
||
|
|
hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
|
||
|
|
|
||
|
|
BLOCK_UPLEN(cb);
|
||
|
|
|
||
|
|
cram_free_block(map);
|
||
|
|
|
||
|
|
if (r >= 0)
|
||
|
|
return cb;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Encodes a slice compression header.
|
||
|
|
*
|
||
|
|
* Returns cram_block on success
|
||
|
|
* NULL on failure
|
||
|
|
*/
|
||
|
|
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
|
||
|
|
char *buf;
|
||
|
|
char *cp;
|
||
|
|
cram_block *b = cram_new_block(MAPPED_SLICE, 0);
|
||
|
|
int j;
|
||
|
|
|
||
|
|
if (!b)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
|
||
|
|
if (NULL == buf) {
|
||
|
|
cram_free_block(b);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
|
||
|
|
cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
|
||
|
|
} else {
|
||
|
|
if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
|
||
|
|
hts_log_error("Reference position too large for CRAM 3");
|
||
|
|
cram_free_block(b);
|
||
|
|
free(buf);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
|
||
|
|
}
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 2)
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
|
||
|
|
else if (CRAM_MAJOR_VERS(fd->version) >= 3)
|
||
|
|
cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
|
||
|
|
for (j = 0; j < s->hdr->num_content_ids; j++) {
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
|
||
|
|
}
|
||
|
|
if (s->hdr->content_type == MAPPED_SLICE)
|
||
|
|
cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) != 1) {
|
||
|
|
memcpy(cp, s->hdr->md5, 16); cp += 16;
|
||
|
|
}
|
||
|
|
|
||
|
|
assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
|
||
|
|
|
||
|
|
b->data = (unsigned char *)buf;
|
||
|
|
b->comp_size = b->uncomp_size = cp-buf;
|
||
|
|
|
||
|
|
return b;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Encodes a single read.
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
static int cram_encode_slice_read(cram_fd *fd,
|
||
|
|
cram_container *c,
|
||
|
|
cram_block_compression_hdr *h,
|
||
|
|
cram_slice *s,
|
||
|
|
cram_record *cr,
|
||
|
|
int64_t *last_pos) {
|
||
|
|
int r = 0;
|
||
|
|
int32_t i32;
|
||
|
|
int64_t i64;
|
||
|
|
unsigned char uc;
|
||
|
|
|
||
|
|
//fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
|
||
|
|
|
||
|
|
//printf("BF=0x%x\n", cr->flags);
|
||
|
|
// bf = cram_flag_swap[cr->flags];
|
||
|
|
i32 = fd->cram_flag_swap[cr->flags & 0xfff];
|
||
|
|
r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
|
||
|
|
|
||
|
|
i32 = cr->cram_flags & CRAM_FLAG_MASK;
|
||
|
|
r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
|
||
|
|
r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
|
||
|
|
|
||
|
|
r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
|
||
|
|
|
||
|
|
if (c->pos_sorted) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
i64 = cr->apos - *last_pos;
|
||
|
|
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
|
||
|
|
} else {
|
||
|
|
i32 = cr->apos - *last_pos;
|
||
|
|
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
|
||
|
|
}
|
||
|
|
*last_pos = cr->apos;
|
||
|
|
} else {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
i64 = cr->apos;
|
||
|
|
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
|
||
|
|
} else {
|
||
|
|
i32 = cr->apos;
|
||
|
|
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
|
||
|
|
|
||
|
|
if (cr->cram_flags & CRAM_FLAG_DETACHED) {
|
||
|
|
i32 = cr->mate_flags;
|
||
|
|
r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
|
||
|
|
|
||
|
|
r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
|
||
|
|
(char *)&cr->mate_ref_id, 1);
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
|
||
|
|
(char *)&cr->mate_pos, 1);
|
||
|
|
r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
|
||
|
|
(char *)&cr->tlen, 1);
|
||
|
|
} else {
|
||
|
|
i32 = cr->mate_pos;
|
||
|
|
r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
|
||
|
|
(char *)&i32, 1);
|
||
|
|
i32 = cr->tlen;
|
||
|
|
r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
|
||
|
|
(char *)&i32, 1);
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
|
||
|
|
r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
|
||
|
|
(char *)&cr->mate_line, 1);
|
||
|
|
}
|
||
|
|
if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
|
||
|
|
(char *)&cr->tlen, 1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Aux tags */
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
int j;
|
||
|
|
uc = cr->ntags;
|
||
|
|
r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
|
||
|
|
|
||
|
|
for (j = 0; j < cr->ntags; j++) {
|
||
|
|
uint32_t i32 = s->TN[cr->TN_idx + j]; // id
|
||
|
|
r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
// qual
|
||
|
|
// QS codec : Already stored in block[2].
|
||
|
|
|
||
|
|
// features (diffs)
|
||
|
|
if (!(cr->flags & BAM_FUNMAP)) {
|
||
|
|
int prev_pos = 0, j;
|
||
|
|
|
||
|
|
r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
|
||
|
|
(char *)&cr->nfeature, 1);
|
||
|
|
for (j = 0; j < cr->nfeature; j++) {
|
||
|
|
cram_feature *f = &s->features[cr->feature + j];
|
||
|
|
|
||
|
|
uc = f->X.code;
|
||
|
|
r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
|
||
|
|
i32 = f->X.pos - prev_pos;
|
||
|
|
r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
|
||
|
|
prev_pos = f->X.pos;
|
||
|
|
|
||
|
|
switch(f->X.code) {
|
||
|
|
//char *seq;
|
||
|
|
|
||
|
|
case 'X':
|
||
|
|
//fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
|
||
|
|
|
||
|
|
uc = f->X.base;
|
||
|
|
r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
|
||
|
|
(char *)&uc, 1);
|
||
|
|
break;
|
||
|
|
case 'S':
|
||
|
|
// Already done
|
||
|
|
//r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
|
||
|
|
// BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
|
||
|
|
// f->S.len);
|
||
|
|
|
||
|
|
//if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
||
|
|
// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
|
||
|
|
// BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
|
||
|
|
// f->S.len);
|
||
|
|
//}
|
||
|
|
break;
|
||
|
|
case 'I':
|
||
|
|
//seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
|
||
|
|
//r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
|
||
|
|
// seq, f->S.len);
|
||
|
|
//if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
||
|
|
// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
|
||
|
|
// BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
|
||
|
|
// f->I.len);
|
||
|
|
//}
|
||
|
|
break;
|
||
|
|
case 'i':
|
||
|
|
uc = f->i.base;
|
||
|
|
r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
|
||
|
|
(char *)&uc, 1);
|
||
|
|
//seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
|
||
|
|
//r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
|
||
|
|
// seq, 1);
|
||
|
|
break;
|
||
|
|
case 'D':
|
||
|
|
i32 = f->D.len;
|
||
|
|
r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
|
||
|
|
(char *)&i32, 1);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'B':
|
||
|
|
// // Used when we try to store a non ACGTN base or an N
|
||
|
|
// // that aligns against a non ACGTN reference
|
||
|
|
|
||
|
|
uc = f->B.base;
|
||
|
|
r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
|
||
|
|
(char *)&uc, 1);
|
||
|
|
|
||
|
|
// Already added
|
||
|
|
// uc = f->B.qual;
|
||
|
|
// r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
|
||
|
|
// (char *)&uc, 1);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'b':
|
||
|
|
// string of bases
|
||
|
|
r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
|
||
|
|
(char *)BLOCK_DATA(s->seqs_blk)
|
||
|
|
+ f->b.seq_idx,
|
||
|
|
f->b.len);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'Q':
|
||
|
|
// Already added
|
||
|
|
// uc = f->B.qual;
|
||
|
|
// r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
|
||
|
|
// (char *)&uc, 1);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'N':
|
||
|
|
i32 = f->N.len;
|
||
|
|
r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
|
||
|
|
(char *)&i32, 1);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'P':
|
||
|
|
i32 = f->P.len;
|
||
|
|
r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
|
||
|
|
(char *)&i32, 1);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'H':
|
||
|
|
i32 = f->H.len;
|
||
|
|
r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
|
||
|
|
(char *)&i32, 1);
|
||
|
|
break;
|
||
|
|
|
||
|
|
|
||
|
|
default:
|
||
|
|
hts_log_error("Unhandled feature code %c", f->X.code);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
|
||
|
|
(char *)&cr->mqual, 1);
|
||
|
|
} else {
|
||
|
|
char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
|
||
|
|
if (cr->len)
|
||
|
|
r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
|
||
|
|
}
|
||
|
|
|
||
|
|
return r ? -1 : 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Applies various compression methods to specific blocks, depending on
|
||
|
|
* known observations of how data series compress.
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
|
||
|
|
int level = fd->level, i;
|
||
|
|
int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
|
||
|
|
int v31_or_above = (fd->version >= (3<<8)+1);
|
||
|
|
|
||
|
|
/* Compress the CORE Block too, with minimal zlib level */
|
||
|
|
if (level > 5 && s->block[0]->uncomp_size > 500)
|
||
|
|
cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
|
||
|
|
|
||
|
|
if (fd->use_bz2)
|
||
|
|
method |= 1<<BZIP2;
|
||
|
|
|
||
|
|
int method_rans = (1<<RANS0) | (1<<RANS1);
|
||
|
|
int method_ranspr = method_rans;
|
||
|
|
|
||
|
|
if (fd->use_rans) {
|
||
|
|
method_ranspr = (1<<RANS_PR0) | (1<<RANS_PR1);
|
||
|
|
if (level > 1)
|
||
|
|
method_ranspr |=
|
||
|
|
(1<<RANS_PR64) | (1<<RANS_PR9)
|
||
|
|
| (1<<RANS_PR128) | (1<<RANS_PR193);
|
||
|
|
if (level > 5)
|
||
|
|
method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (fd->use_rans) {
|
||
|
|
methodF |= v31_or_above ? method_ranspr : method_rans;
|
||
|
|
method |= v31_or_above ? method_ranspr : method_rans;
|
||
|
|
}
|
||
|
|
|
||
|
|
int method_arith = 0;
|
||
|
|
if (fd->use_arith) {
|
||
|
|
method_arith = (1<<ARITH_PR0) | (1<<ARITH_PR1);
|
||
|
|
if (level > 1)
|
||
|
|
method_arith |=
|
||
|
|
(1<<ARITH_PR64) | (1<<ARITH_PR9)
|
||
|
|
| (1<<ARITH_PR128) | (1<<ARITH_PR129)
|
||
|
|
| (1<<ARITH_PR192) | (1u<<ARITH_PR193);
|
||
|
|
}
|
||
|
|
if (fd->use_arith && v31_or_above) {
|
||
|
|
methodF |= method_arith;
|
||
|
|
method |= method_arith;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (fd->use_lzma)
|
||
|
|
method |= (1<<LZMA);
|
||
|
|
|
||
|
|
/* Faster method for data series we only need entropy encoding on */
|
||
|
|
methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
|
||
|
|
if (level >= 5) {
|
||
|
|
method |= 1<<GZIP_1;
|
||
|
|
methodF = method;
|
||
|
|
}
|
||
|
|
if (level == 1) {
|
||
|
|
method &= ~(1<<GZIP);
|
||
|
|
method |= 1<<GZIP_1;
|
||
|
|
methodF = method;
|
||
|
|
}
|
||
|
|
|
||
|
|
int qmethod = method;
|
||
|
|
int qmethodF = method;
|
||
|
|
if (v31_or_above && fd->use_fqz) {
|
||
|
|
qmethod |= 1<<FQZ;
|
||
|
|
qmethodF |= 1<<FQZ;
|
||
|
|
if (fd->level > 4) {
|
||
|
|
qmethod |= 1<<FQZ_b;
|
||
|
|
qmethodF |= 1<<FQZ_b;
|
||
|
|
}
|
||
|
|
if (fd->level > 6) {
|
||
|
|
qmethod |= (1<<FQZ_c) | (1<<FQZ_d);
|
||
|
|
qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
pthread_mutex_lock(&fd->metrics_lock);
|
||
|
|
for (i = 0; i < DS_END; i++)
|
||
|
|
if (c->stats[i] && c->stats[i]->nvals > 16)
|
||
|
|
fd->m[i]->unpackable = 1;
|
||
|
|
pthread_mutex_unlock(&fd->metrics_lock);
|
||
|
|
|
||
|
|
/* Specific compression methods for certain block types */
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
|
||
|
|
method, level))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (fd->level == 0) {
|
||
|
|
/* Do nothing */
|
||
|
|
} else if (fd->level == 1) {
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
|
||
|
|
qmethodF, 1))
|
||
|
|
return -1;
|
||
|
|
for (i = DS_aux; i <= DS_aux_oz; i++) {
|
||
|
|
if (s->block[i])
|
||
|
|
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
|
||
|
|
method, 1))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
} else if (fd->level < 3) {
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
|
||
|
|
qmethod, 1))
|
||
|
|
return -1;
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
|
||
|
|
method, 1))
|
||
|
|
return -1;
|
||
|
|
if (s->block[DS_BB])
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
|
||
|
|
method, 1))
|
||
|
|
return -1;
|
||
|
|
for (i = DS_aux; i <= DS_aux_oz; i++) {
|
||
|
|
if (s->block[i])
|
||
|
|
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
|
||
|
|
method, level))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
|
||
|
|
qmethod, level))
|
||
|
|
return -1;
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
|
||
|
|
method, level))
|
||
|
|
return -1;
|
||
|
|
if (s->block[DS_BB])
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
|
||
|
|
method, level))
|
||
|
|
return -1;
|
||
|
|
for (i = DS_aux; i <= DS_aux_oz; i++) {
|
||
|
|
if (s->block[i])
|
||
|
|
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
|
||
|
|
method, level))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// NAME: best is generally xz, bzip2, zlib then rans1
|
||
|
|
int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
|
||
|
|
if (fd->version >= (3<<8)+1 && fd->use_tok)
|
||
|
|
method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
|
||
|
|
method_rn, level))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// NS shows strong local correlation as rearrangements are localised
|
||
|
|
if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
|
||
|
|
if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
|
||
|
|
method, level))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Compress any auxiliary tags with their own per-tag metrics
|
||
|
|
*/
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
for (i = DS_END /*num_blk - naux_blk*/; i < s->hdr->num_blocks; i++) {
|
||
|
|
if (!s->block[i] || s->block[i] == s->block[0])
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (s->block[i]->method != RAW)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
|
||
|
|
method, level))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Minimal compression of any block still uncompressed, bar CORE
|
||
|
|
*/
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
|
||
|
|
if (!s->block[i] || s->block[i] == s->block[0])
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (s->block[i]->method != RAW)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
|
||
|
|
methodF, level))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Allocates a block associated with the cram codec associated with
|
||
|
|
* data series ds_id or the internal codec_id (depending on codec
|
||
|
|
* type).
|
||
|
|
*
|
||
|
|
* The ds_ids are what end up written to disk as an external block.
|
||
|
|
* The c_ids are internal and used when daisy-chaining transforms
|
||
|
|
* such as MAP and RLE. These blocks are also allocated, but
|
||
|
|
* are ephemeral in nature. (The codecs themselves cannot allocate
|
||
|
|
* these as the same codec pointer may be operating on multiple slices
|
||
|
|
* if we're using a multi-slice container.)
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
|
||
|
|
if (!codec)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
switch(codec->codec) {
|
||
|
|
// Codecs which are hard-coded to use the CORE block
|
||
|
|
case E_GOLOMB:
|
||
|
|
case E_HUFFMAN:
|
||
|
|
case E_BETA:
|
||
|
|
case E_SUBEXP:
|
||
|
|
case E_GOLOMB_RICE:
|
||
|
|
case E_GAMMA:
|
||
|
|
codec->out = s->block[0];
|
||
|
|
break;
|
||
|
|
|
||
|
|
// Codecs which don't use external blocks
|
||
|
|
case E_CONST_BYTE:
|
||
|
|
case E_CONST_INT:
|
||
|
|
codec->out = NULL;
|
||
|
|
break;
|
||
|
|
|
||
|
|
// Codecs that emit directly to external blocks
|
||
|
|
case E_EXTERNAL:
|
||
|
|
case E_VARINT_UNSIGNED:
|
||
|
|
case E_VARINT_SIGNED:
|
||
|
|
if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
|
||
|
|
return -1;
|
||
|
|
codec->u.external.content_id = ds_id;
|
||
|
|
codec->out = s->block[ds_id];
|
||
|
|
break;
|
||
|
|
|
||
|
|
case E_BYTE_ARRAY_STOP: // Why no sub-codec?
|
||
|
|
if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
|
||
|
|
return -1;
|
||
|
|
codec->u.byte_array_stop.content_id = ds_id;
|
||
|
|
codec->out = s->block[ds_id];
|
||
|
|
break;
|
||
|
|
|
||
|
|
|
||
|
|
// Codecs that contain sub-codecs which may in turn emit to external blocks
|
||
|
|
case E_BYTE_ARRAY_LEN: {
|
||
|
|
cram_codec *bal = codec->u.e_byte_array_len.len_codec;
|
||
|
|
if (cram_allocate_block(bal, s, bal->u.external.content_id))
|
||
|
|
return -1;
|
||
|
|
bal = codec->u.e_byte_array_len.val_codec;
|
||
|
|
if (cram_allocate_block(bal, s, bal->u.external.content_id))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case E_XRLE:
|
||
|
|
if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
|
||
|
|
//ds_id == DS_QS ? DS_QS_len : ds_id))
|
||
|
|
return -1;
|
||
|
|
if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
break;
|
||
|
|
|
||
|
|
case E_XPACK:
|
||
|
|
if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
|
||
|
|
return -1;
|
||
|
|
codec->out = cram_new_block(0, 0); // ephemeral
|
||
|
|
if (!codec->out)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
break;
|
||
|
|
|
||
|
|
case E_XDELTA:
|
||
|
|
if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
|
||
|
|
return -1;
|
||
|
|
codec->out = cram_new_block(0, 0); // ephemeral
|
||
|
|
if (!codec->out)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Encodes a single slice from a container
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
static int cram_encode_slice(cram_fd *fd, cram_container *c,
|
||
|
|
cram_block_compression_hdr *h, cram_slice *s,
|
||
|
|
int embed_ref) {
|
||
|
|
int rec, r = 0;
|
||
|
|
int64_t last_pos;
|
||
|
|
enum cram_DS_ID id;
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Slice external blocks:
|
||
|
|
* ID 0 => base calls (insertions, soft-clip)
|
||
|
|
* ID 1 => qualities
|
||
|
|
* ID 2 => names
|
||
|
|
* ID 3 => TS (insert size), NP (next frag)
|
||
|
|
* ID 4 => tag values
|
||
|
|
* ID 6 => tag IDs (TN), if CRAM_V1.0
|
||
|
|
* ID 7 => TD tag dictionary, if !CRAM_V1.0
|
||
|
|
*/
|
||
|
|
|
||
|
|
/* Create cram slice header */
|
||
|
|
s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
|
||
|
|
? DS_ref
|
||
|
|
: (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
|
||
|
|
s->hdr->record_counter = c->num_records + c->record_counter;
|
||
|
|
c->num_records += s->hdr->num_records;
|
||
|
|
|
||
|
|
int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
|
||
|
|
s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
|
||
|
|
s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
|
||
|
|
if (!s->block || !s->hdr->block_content_ids)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// Create first fixed blocks, always external.
|
||
|
|
// CORE
|
||
|
|
if (!(s->block[0] = cram_new_block(CORE, 0)))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// TN block for CRAM v1
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
|
||
|
|
if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
|
||
|
|
h->codecs[DS_TN]->u.external.content_id = DS_TN;
|
||
|
|
} else {
|
||
|
|
s->block[DS_TN] = s->block[0];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Embedded reference
|
||
|
|
if (embed_ref>0) {
|
||
|
|
if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
|
||
|
|
return -1;
|
||
|
|
s->ref_id = DS_ref; // needed?
|
||
|
|
BLOCK_APPEND(s->block[DS_ref],
|
||
|
|
c->ref + s->hdr->ref_seq_start - c->ref_start,
|
||
|
|
s->hdr->ref_seq_span);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* All the data-series blocks if appropriate.
|
||
|
|
*/
|
||
|
|
for (id = DS_QS; id < DS_TN; id++) {
|
||
|
|
if (cram_allocate_block(h->codecs[id], s, id) < 0)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Add in the external tag blocks too.
|
||
|
|
*/
|
||
|
|
if (c->tags_used) {
|
||
|
|
int n;
|
||
|
|
s->hdr->num_blocks = DS_END;
|
||
|
|
for (n = 0; n < s->naux_block; n++) {
|
||
|
|
s->block[s->hdr->num_blocks++] = s->aux_block[n];
|
||
|
|
s->aux_block[n] = NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Encode reads */
|
||
|
|
last_pos = s->hdr->ref_seq_start;
|
||
|
|
for (rec = 0; rec < s->hdr->num_records; rec++) {
|
||
|
|
cram_record *cr = &s->crecs[rec];
|
||
|
|
if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
|
||
|
|
s->block[0]->comp_size = s->block[0]->uncomp_size;
|
||
|
|
|
||
|
|
// Make sure the fixed blocks point to the correct sources
|
||
|
|
if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
|
||
|
|
s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
|
||
|
|
if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
|
||
|
|
s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
|
||
|
|
if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
|
||
|
|
s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
|
||
|
|
if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
|
||
|
|
s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
|
||
|
|
|
||
|
|
// Finalise any data transforms.
|
||
|
|
for (id = DS_QS; id < DS_TN; id++) {
|
||
|
|
if (h->codecs[id] && h->codecs[id]->flush)
|
||
|
|
h->codecs[id]->flush(h->codecs[id]);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Ensure block sizes are up to date.
|
||
|
|
for (id = 1; id < s->hdr->num_blocks; id++) {
|
||
|
|
if (!s->block[id] || s->block[id] == s->block[0])
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (s->block[id]->uncomp_size == 0)
|
||
|
|
BLOCK_UPLEN(s->block[id]);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Compress it all
|
||
|
|
if (cram_compress_slice(fd, c, s) == -1)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// Collapse empty blocks and create hdr_block
|
||
|
|
{
|
||
|
|
int i, j;
|
||
|
|
|
||
|
|
s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
|
||
|
|
s->hdr->num_blocks * sizeof(int32_t));
|
||
|
|
if (!s->hdr->block_content_ids)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
for (i = j = 1; i < s->hdr->num_blocks; i++) {
|
||
|
|
if (!s->block[i] || s->block[i] == s->block[0])
|
||
|
|
continue;
|
||
|
|
if (s->block[i]->uncomp_size == 0) {
|
||
|
|
cram_free_block(s->block[i]);
|
||
|
|
s->block[i] = NULL;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
s->block[j] = s->block[i];
|
||
|
|
s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
|
||
|
|
j++;
|
||
|
|
}
|
||
|
|
s->hdr->num_content_ids = j-1;
|
||
|
|
s->hdr->num_blocks = j;
|
||
|
|
|
||
|
|
if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
return r ? -1 : 0;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static inline const char *bam_data_end(bam1_t *b) {
|
||
|
|
return (const char *)b->data + b->l_data;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* A bounds checking version of bam_aux2i.
|
||
|
|
*/
|
||
|
|
static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
|
||
|
|
int type = *aux++;
|
||
|
|
switch (type) {
|
||
|
|
case 'c':
|
||
|
|
if (aux_end - aux < 1) {
|
||
|
|
errno = EINVAL;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return *(int8_t *)aux;
|
||
|
|
case 'C':
|
||
|
|
if (aux_end - aux < 1) {
|
||
|
|
errno = EINVAL;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return *aux;
|
||
|
|
case 's':
|
||
|
|
if (aux_end - aux < 2) {
|
||
|
|
errno = EINVAL;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return le_to_i16(aux);
|
||
|
|
case 'S':
|
||
|
|
if (aux_end - aux < 2) {
|
||
|
|
errno = EINVAL;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return le_to_u16(aux);
|
||
|
|
case 'i':
|
||
|
|
if (aux_end - aux < 4) {
|
||
|
|
errno = EINVAL;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return le_to_i32(aux);
|
||
|
|
case 'I':
|
||
|
|
if (aux_end - aux < 4) {
|
||
|
|
errno = EINVAL;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
return le_to_u32(aux);
|
||
|
|
default:
|
||
|
|
errno = EINVAL;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Returns the number of expected read names for this record.
|
||
|
|
*/
|
||
|
|
static int expected_template_count(bam_seq_t *b) {
|
||
|
|
int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
|
||
|
|
|
||
|
|
uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
|
||
|
|
if (TC) {
|
||
|
|
int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
|
||
|
|
if (expected < n)
|
||
|
|
expected = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!TC && bam_aux_get(b, "SA")) {
|
||
|
|
// We could count the semicolons, but we'd have to do this for
|
||
|
|
// read1, read2 and read(not-1-or-2) combining the results
|
||
|
|
// together. This is a cheap and safe alternative for now.
|
||
|
|
expected = INT_MAX;
|
||
|
|
}
|
||
|
|
|
||
|
|
return expected;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Lossily reject read names.
|
||
|
|
*
|
||
|
|
* The rule here is that if all reads for this template reside in the
|
||
|
|
* same slice then we can lose the name. Otherwise we keep them as we
|
||
|
|
* do not know when (or if) the other reads will turn up.
|
||
|
|
*
|
||
|
|
* Note there may be only 1 read (non-paired library) or more than 2
|
||
|
|
* reads (paired library with supplementary reads), or other weird
|
||
|
|
* setups. We need to know how many are expected. Ways to guess:
|
||
|
|
*
|
||
|
|
* - Flags (0x1 - has > 1 read)
|
||
|
|
* - TC aux field (not mandatory)
|
||
|
|
* - SA tags (count semicolons, NB per fragment so sum - hard)
|
||
|
|
* - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
|
||
|
|
int bam_start) {
|
||
|
|
int r1, r2, ret = -1;
|
||
|
|
|
||
|
|
// Initialise cram_flags
|
||
|
|
for (r2 = 0; r2 < s->hdr->num_records; r2++)
|
||
|
|
s->crecs[r2].cram_flags = 0;
|
||
|
|
|
||
|
|
if (!fd->lossy_read_names)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
khash_t(m_s2u64) *names = kh_init(m_s2u64);
|
||
|
|
if (!names)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
// 1: Iterate through names to count frequency
|
||
|
|
for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
|
||
|
|
//cram_record *cr = &s->crecs[r2];
|
||
|
|
bam_seq_t *b = c->bams[r1];
|
||
|
|
khint_t k;
|
||
|
|
int n;
|
||
|
|
uint64_t e;
|
||
|
|
union {
|
||
|
|
uint64_t i64;
|
||
|
|
struct {
|
||
|
|
int32_t e,c; // expected & observed counts.
|
||
|
|
} counts;
|
||
|
|
} u;
|
||
|
|
|
||
|
|
e = expected_template_count(b);
|
||
|
|
u.counts.e = e; u.counts.c = 1;
|
||
|
|
|
||
|
|
k = kh_put(m_s2u64, names, bam_name(b), &n);
|
||
|
|
if (n == -1)
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (n == 0) {
|
||
|
|
// not a new name
|
||
|
|
u.i64 = kh_val(names, k);
|
||
|
|
if (u.counts.e != e) {
|
||
|
|
// different expectation or already hit the max
|
||
|
|
//fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
|
||
|
|
kh_val(names, k) = 0;
|
||
|
|
} else {
|
||
|
|
u.counts.c++;
|
||
|
|
if (u.counts.e == u.counts.c) {
|
||
|
|
// Reached expected count.
|
||
|
|
kh_val(names, k) = -1;
|
||
|
|
} else {
|
||
|
|
kh_val(names, k) = u.i64;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
// new name
|
||
|
|
kh_val(names, k) = u.i64;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// 2: Remove names if all present (hd.i == -1)
|
||
|
|
for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
|
||
|
|
cram_record *cr = &s->crecs[r2];
|
||
|
|
bam_seq_t *b = c->bams[r1];
|
||
|
|
khint_t k;
|
||
|
|
|
||
|
|
k = kh_get(m_s2u64, names, bam_name(b));
|
||
|
|
|
||
|
|
if (k == kh_end(names))
|
||
|
|
goto fail;
|
||
|
|
|
||
|
|
if (kh_val(names, k) == -1)
|
||
|
|
cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
|
||
|
|
}
|
||
|
|
|
||
|
|
ret = 0;
|
||
|
|
fail: // ret==-1
|
||
|
|
|
||
|
|
if (names)
|
||
|
|
kh_destroy(m_s2u64, names);
|
||
|
|
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Adds the reading names. We do this here as a separate pass rather
|
||
|
|
* than per record in the process_one_read calls as that function can
|
||
|
|
* go back and change the CRAM_FLAG_DETACHED status of a previously
|
||
|
|
* processed read if it subsequently determines the TLEN field is
|
||
|
|
* incorrect. Given DETACHED reads always try to decode read names,
|
||
|
|
* we need to know their status before generating the read-name block.
|
||
|
|
*
|
||
|
|
* Output is an update s->name_blk, and cr->name / cr->name_len
|
||
|
|
* fields.
|
||
|
|
*/
|
||
|
|
static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
|
||
|
|
int bam_start) {
|
||
|
|
int r1, r2;
|
||
|
|
int keep_names = !fd->lossy_read_names;
|
||
|
|
|
||
|
|
for (r1 = bam_start, r2 = 0;
|
||
|
|
r1 < c->curr_c_rec && r2 < s->hdr->num_records;
|
||
|
|
r1++, r2++) {
|
||
|
|
cram_record *cr = &s->crecs[r2];
|
||
|
|
bam_seq_t *b = c->bams[r1];
|
||
|
|
|
||
|
|
cr->name = BLOCK_SIZE(s->name_blk);
|
||
|
|
if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4
|
||
|
|
&& (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
|
||
|
|
&& cr->mate_line) {
|
||
|
|
// Dedup read names in V4
|
||
|
|
BLOCK_APPEND(s->name_blk, "\0", 1);
|
||
|
|
cr->name_len = 1;
|
||
|
|
} else {
|
||
|
|
BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
|
||
|
|
cr->name_len = bam_name_len(b);
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
// Can only discard duplicate names if not detached
|
||
|
|
cr->name_len = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
|
||
|
|
goto block_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// CRAM version >= 3.1
|
||
|
|
#define CRAM_ge31(v) ((v) >= 0x301)
|
||
|
|
|
||
|
|
// Returns the next cigar op code: one of the BAM_C* codes,
|
||
|
|
// or -1 if no more are present.
|
||
|
|
static inline
|
||
|
|
int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
|
||
|
|
uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
|
||
|
|
for(;;) {
|
||
|
|
while (*cig_len == 0) {
|
||
|
|
if (*cig_ind < ncigar) {
|
||
|
|
*cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK;
|
||
|
|
*cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
|
||
|
|
(*cig_ind)++;
|
||
|
|
} else {
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (skip[*cig_op]) {
|
||
|
|
*spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
|
||
|
|
*cig_len = 0;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
(*cig_len)--;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
return *cig_op;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Ensure ref and hist are large enough.
|
||
|
|
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
|
||
|
|
hts_pos_t ref_start, hts_pos_t *ref_end) {
|
||
|
|
if (pos < ref_start)
|
||
|
|
return -1;
|
||
|
|
if (pos < *ref_end)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
// realloc
|
||
|
|
if (pos - ref_start > UINT_MAX)
|
||
|
|
return -2; // protect overflow in new_end calculation
|
||
|
|
|
||
|
|
hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
|
||
|
|
hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
|
||
|
|
|
||
|
|
// Refuse to work on excessively large blocks.
|
||
|
|
// We'll just switch to referenceless encoding, which is probably better
|
||
|
|
// here as this must be very sparse data anyway.
|
||
|
|
if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
|
||
|
|
return -2;
|
||
|
|
|
||
|
|
char *tmp = realloc(*ref, new_end-ref_start+1);
|
||
|
|
if (!tmp)
|
||
|
|
return -1;
|
||
|
|
*ref = tmp;
|
||
|
|
|
||
|
|
uint32_t (*tmp5)[5] = realloc(**hist,
|
||
|
|
(new_end - ref_start)*sizeof(**hist));
|
||
|
|
if (!tmp5)
|
||
|
|
return -1;
|
||
|
|
*hist = tmp5;
|
||
|
|
*ref_end = new_end;
|
||
|
|
|
||
|
|
// initialise
|
||
|
|
old_end -= ref_start;
|
||
|
|
new_end -= ref_start;
|
||
|
|
memset(&(*ref)[old_end], 0, new_end-old_end);
|
||
|
|
memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Walk through MD + seq to generate ref
|
||
|
|
// Returns 1 on success, <0 on failure
|
||
|
|
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
|
||
|
|
hts_pos_t ref_start, hts_pos_t *ref_end,
|
||
|
|
const uint8_t *MD) {
|
||
|
|
uint8_t *seq = bam_get_seq(b);
|
||
|
|
uint32_t *cigar = bam_get_cigar(b);
|
||
|
|
uint32_t ncigar = b->core.n_cigar;
|
||
|
|
uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
|
||
|
|
|
||
|
|
int iseq = 0, next_op;
|
||
|
|
hts_pos_t iref = b->core.pos - ref_start;
|
||
|
|
|
||
|
|
// Skip INS, REF_SKIP, *CLIP, PAD. and BACK.
|
||
|
|
static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
|
||
|
|
while (iseq < b->core.l_qseq && *MD) {
|
||
|
|
if (isdigit(*MD)) {
|
||
|
|
// match
|
||
|
|
int overflow = 0;
|
||
|
|
int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
|
||
|
|
if (overflow ||
|
||
|
|
extend_ref(ref, hist, iref+ref_start + len,
|
||
|
|
ref_start, ref_end) < 0)
|
||
|
|
return -1;
|
||
|
|
while (iseq < b->core.l_qseq && len) {
|
||
|
|
// rewrite to have internal loops?
|
||
|
|
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
|
||
|
|
&iseq, &cig_ind, &cig_op,
|
||
|
|
&cig_len)) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (next_op != BAM_CMATCH &&
|
||
|
|
next_op != BAM_CEQUAL) {
|
||
|
|
hts_log_info("MD:Z and CIGAR are incompatible for "
|
||
|
|
"record %s", bam_get_qname(b));
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Short-cut loop over same cigar op for efficiency
|
||
|
|
cig_len++;
|
||
|
|
do {
|
||
|
|
cig_len--;
|
||
|
|
(*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
|
||
|
|
iseq++;
|
||
|
|
len--;
|
||
|
|
} while (cig_len && iseq < b->core.l_qseq && len);
|
||
|
|
}
|
||
|
|
if (len > 0)
|
||
|
|
return -1; // MD is longer than seq
|
||
|
|
} else if (*MD == '^') {
|
||
|
|
// deletion
|
||
|
|
MD++;
|
||
|
|
while (isalpha(*MD)) {
|
||
|
|
if (extend_ref(ref, hist, iref+ref_start, ref_start,
|
||
|
|
ref_end) < 0)
|
||
|
|
return -1;
|
||
|
|
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
|
||
|
|
&iseq, &cig_ind, &cig_op,
|
||
|
|
&cig_len)) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (next_op != BAM_CDEL) {
|
||
|
|
hts_log_info("MD:Z and CIGAR are incompatible");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
(*ref)[iref++] = *MD++ & ~0x20;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
// substitution
|
||
|
|
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
|
||
|
|
return -1;
|
||
|
|
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
|
||
|
|
&iseq, &cig_ind, &cig_op,
|
||
|
|
&cig_len)) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
|
||
|
|
hts_log_info("MD:Z and CIGAR are incompatible");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
(*ref)[iref++] = *MD++ & ~0x20;
|
||
|
|
iseq++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Append a sequence to a ref/consensus structure.
|
||
|
|
// We maintain both an absolute refefence (ACGTN where MD:Z is
|
||
|
|
// present) and a 5-way frequency array for when no MD:Z is known.
|
||
|
|
// We then subsequently convert the 5-way frequencies to a consensus
|
||
|
|
// ref in a second pass.
|
||
|
|
//
|
||
|
|
// Returns >=0 on success,
|
||
|
|
// -1 on failure (eg inconsistent data)
|
||
|
|
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
|
||
|
|
hts_pos_t ref_start, hts_pos_t *ref_end) {
|
||
|
|
const uint8_t *MD = bam_aux_get(b, "MD");
|
||
|
|
int ret = 0;
|
||
|
|
if (MD && *MD == 'Z') {
|
||
|
|
// We can use MD to directly compute the reference
|
||
|
|
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
|
||
|
|
|
||
|
|
if (ret > 0)
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Otherwise we just use SEQ+CIGAR and build a consensus which we later
|
||
|
|
// turn into a fake reference
|
||
|
|
uint32_t *cigar = bam_get_cigar(b);
|
||
|
|
uint32_t ncigar = b->core.n_cigar;
|
||
|
|
uint32_t i, j;
|
||
|
|
hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
|
||
|
|
uint8_t *seq = bam_get_seq(b);
|
||
|
|
for (i = 0; i < ncigar; i++) {
|
||
|
|
switch (bam_cigar_op(cigar[i])) {
|
||
|
|
case BAM_CSOFT_CLIP:
|
||
|
|
case BAM_CINS:
|
||
|
|
iseq += bam_cigar_oplen(cigar[i]);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BAM_CMATCH:
|
||
|
|
case BAM_CEQUAL:
|
||
|
|
case BAM_CDIFF: {
|
||
|
|
int len = bam_cigar_oplen(cigar[i]);
|
||
|
|
// Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4
|
||
|
|
static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
|
||
|
|
|
||
|
|
if (extend_ref(ref, hist, iref+ref_start + len,
|
||
|
|
ref_start, ref_end) < 0)
|
||
|
|
return -1;
|
||
|
|
if (iseq + len <= b->core.l_qseq) {
|
||
|
|
// Nullify failed MD:Z if appropriate
|
||
|
|
if (ret < 0)
|
||
|
|
memset(&(*ref)[iref], 0, len);
|
||
|
|
|
||
|
|
for (j = 0; j < len; j++, iref++, iseq++)
|
||
|
|
(*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
|
||
|
|
} else {
|
||
|
|
// Probably a 2ndary read with seq "*"
|
||
|
|
iseq += len;
|
||
|
|
iref += len;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case BAM_CDEL:
|
||
|
|
case BAM_CREF_SKIP:
|
||
|
|
iref += bam_cigar_oplen(cigar[i]);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Automatically generates the reference and stashed it in c->ref, also
|
||
|
|
// setting c->ref_start and c->ref_end.
|
||
|
|
//
|
||
|
|
// If we have MD:Z tags then we use them to directly infer the reference,
|
||
|
|
// along with SEQ + CIGAR. Otherwise we use SEQ/CIGAR only to build up
|
||
|
|
// a consensus and then assume the reference as the majority rule.
|
||
|
|
//
|
||
|
|
// In this latter scenario we need to be wary of auto-generating MD and NM
|
||
|
|
// during decode, but that's handled elsewhere via an additional aux tag.
|
||
|
|
//
|
||
|
|
// Returns 0 on success,
|
||
|
|
// -1 on failure
|
||
|
|
static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
|
||
|
|
// TODO: if we can find an external reference then use it, even if the
|
||
|
|
// user told us to do embed_ref=2.
|
||
|
|
char *ref = NULL;
|
||
|
|
uint32_t (*hist)[5] = NULL;
|
||
|
|
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
|
||
|
|
if (ref_start < 0)
|
||
|
|
return -1; // cannot build consensus from unmapped data
|
||
|
|
|
||
|
|
// initial allocation
|
||
|
|
if (extend_ref(&ref, &hist,
|
||
|
|
c->bams[r1 + s->hdr->num_records-1]->core.pos +
|
||
|
|
c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
|
||
|
|
ref_start, &ref_end) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// Add each bam file to the reference/consensus arrays
|
||
|
|
int r2;
|
||
|
|
hts_pos_t last_pos = -1;
|
||
|
|
for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
|
||
|
|
if (c->bams[r1]->core.pos < last_pos) {
|
||
|
|
hts_log_error("Cannot build reference with unsorted data");
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
last_pos = c->bams[r1]->core.pos;
|
||
|
|
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Compute the consensus
|
||
|
|
hts_pos_t i;
|
||
|
|
for (i = 0; i < ref_end-ref_start; i++) {
|
||
|
|
if (!ref[i]) {
|
||
|
|
int max_v = 0, max_j = 4, j;
|
||
|
|
for (j = 0; j < 4; j++)
|
||
|
|
// don't call N (j==4) unless no coverage
|
||
|
|
if (max_v < hist[i][j])
|
||
|
|
max_v = hist[i][j], max_j = j;
|
||
|
|
ref[i] = "ACGTN"[max_j];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
free(hist);
|
||
|
|
|
||
|
|
// Put the reference in place so it appears to be an external
|
||
|
|
// ref file.
|
||
|
|
c->ref = ref;
|
||
|
|
c->ref_start = ref_start+1;
|
||
|
|
c->ref_end = ref_end+1;
|
||
|
|
c->ref_free = 1;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
err:
|
||
|
|
free(ref);
|
||
|
|
free(hist);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Check if the SQ M5 tag matches the reference we've loaded.
|
||
|
|
static int validate_md5(cram_fd *fd, int ref_id) {
|
||
|
|
if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
// Have we already checked this ref?
|
||
|
|
if (fd->refs->ref_id[ref_id]->validated_md5)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
// Check if we have the MD5 known.
|
||
|
|
// We should, but maybe we're using embedded references?
|
||
|
|
sam_hrecs_t *hrecs = fd->header->hrecs;
|
||
|
|
sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
|
||
|
|
hrecs->ref[ref_id].name);
|
||
|
|
if (!ty)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
|
||
|
|
if (!m5tag)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
// It's known, so compute md5 on the loaded reference sequence.
|
||
|
|
char *ref = fd->refs->ref_id[ref_id]->seq;
|
||
|
|
int64_t len = fd->refs->ref_id[ref_id]->length;
|
||
|
|
hts_md5_context *md5;
|
||
|
|
char unsigned buf[16];
|
||
|
|
char buf2[33];
|
||
|
|
|
||
|
|
if (!(md5 = hts_md5_init()))
|
||
|
|
return -1;
|
||
|
|
hts_md5_update(md5, ref, len);
|
||
|
|
hts_md5_final(buf, md5);
|
||
|
|
hts_md5_destroy(md5);
|
||
|
|
hts_md5_hex(buf2, buf);
|
||
|
|
|
||
|
|
// Compare it to header @SQ M5 tag
|
||
|
|
if (strcmp(m5tag->str+3, buf2)) {
|
||
|
|
hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
|
||
|
|
hrecs->ref[ref_id].name);
|
||
|
|
hts_log_error("Please use the correct reference, or "
|
||
|
|
"consider using embed_ref=2");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
fd->refs->ref_id[ref_id]->validated_md5 = 1;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Encodes all slices in a container into blocks.
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
int cram_encode_container(cram_fd *fd, cram_container *c) {
|
||
|
|
int i, j, slice_offset;
|
||
|
|
cram_block_compression_hdr *h = c->comp_hdr;
|
||
|
|
cram_block *c_hdr;
|
||
|
|
int multi_ref = 0;
|
||
|
|
int r1, r2, sn, nref, embed_ref, no_ref;
|
||
|
|
spare_bams *spares;
|
||
|
|
|
||
|
|
if (!c->bams)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
//#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
|
||
|
|
#define goto_err goto err
|
||
|
|
|
||
|
|
// Don't try embed ref if we repeatedly fail
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries
|
||
|
|
if (!failed_embed && c->embed_ref == -2) {
|
||
|
|
hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
|
||
|
|
fd->no_ref = c->no_ref = 0;
|
||
|
|
fd->embed_ref = c->embed_ref = 2;
|
||
|
|
} else if (failed_embed && c->embed_ref == -2) {
|
||
|
|
// We've tried several times, so this time give up for good
|
||
|
|
hts_log_warning("Keeping non-ref mode from now on");
|
||
|
|
fd->embed_ref = c->embed_ref = 0;
|
||
|
|
}
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
|
||
|
|
restart:
|
||
|
|
/* Cache references up-front if we have unsorted access patterns */
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
nref = fd->refs->nref;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
embed_ref = c->embed_ref;
|
||
|
|
no_ref = c->no_ref;
|
||
|
|
|
||
|
|
/* To create M5 strings */
|
||
|
|
/* Fetch reference sequence */
|
||
|
|
if (!no_ref) {
|
||
|
|
if (!c->bams || !c->curr_c_rec || !c->bams[0])
|
||
|
|
goto_err;
|
||
|
|
bam_seq_t *b = c->bams[0];
|
||
|
|
|
||
|
|
if (embed_ref <= 1) {
|
||
|
|
char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
|
||
|
|
if (!ref && bam_ref(b) >= 0) {
|
||
|
|
if (!c->pos_sorted) {
|
||
|
|
// TODO: maybe also check fd->no_ref?
|
||
|
|
hts_log_warning("Failed to load reference #%d",
|
||
|
|
bam_ref(b));
|
||
|
|
hts_log_warning("Switching to non-ref mode");
|
||
|
|
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
c->embed_ref = fd->embed_ref = 0;
|
||
|
|
c->no_ref = fd->no_ref = 1;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
goto restart;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (c->multi_seq || embed_ref == 0) {
|
||
|
|
hts_log_error("Failed to load reference #%d", bam_ref(b));
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
hts_log_warning("Failed to load reference #%d", bam_ref(b));
|
||
|
|
hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
|
||
|
|
" reference");
|
||
|
|
if (embed_ref <= 0)
|
||
|
|
hts_log_warning("NOTE: the CRAM file will be bigger than"
|
||
|
|
" using an external reference");
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
embed_ref = c->embed_ref = fd->embed_ref = 2;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
goto auto_ref;
|
||
|
|
} else if (ref) {
|
||
|
|
if (validate_md5(fd, c->ref_seq_id) < 0)
|
||
|
|
goto_err;
|
||
|
|
}
|
||
|
|
if ((c->ref_id = bam_ref(b)) >= 0) {
|
||
|
|
c->ref_seq_id = c->ref_id;
|
||
|
|
c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
|
||
|
|
c->ref_start = 1;
|
||
|
|
c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
auto_ref:
|
||
|
|
// Auto-embed ref.
|
||
|
|
// This starts as 'N' and is amended on-the-fly as we go
|
||
|
|
// based on MD:Z tags.
|
||
|
|
if ((c->ref_id = bam_ref(b)) >= 0) {
|
||
|
|
c->ref = NULL;
|
||
|
|
// c->ref_free is boolean; whether to free c->ref. In this
|
||
|
|
// case c->ref will be our auto-embedded sequence instead of
|
||
|
|
// a "global" portion of reference from fd->refs.
|
||
|
|
// Do not confuse with fd->ref_free which is a pointer to a
|
||
|
|
// reference string to free.
|
||
|
|
c->ref_free = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
c->ref_seq_id = c->ref_id;
|
||
|
|
} else {
|
||
|
|
c->ref_id = bam_ref(c->bams[0]);
|
||
|
|
cram_ref_incr(fd->refs, c->ref_id);
|
||
|
|
c->ref_seq_id = c->ref_id;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!no_ref && c->refs_used) {
|
||
|
|
for (i = 0; i < nref; i++) {
|
||
|
|
if (c->refs_used[i]) {
|
||
|
|
if (cram_get_ref(fd, i, 1, 0)) {
|
||
|
|
if (validate_md5(fd, i) < 0)
|
||
|
|
goto_err;
|
||
|
|
} else {
|
||
|
|
hts_log_warning("Failed to find reference, "
|
||
|
|
"switching to non-ref mode");
|
||
|
|
no_ref = c->no_ref = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Turn bams into cram_records and gather basic stats */
|
||
|
|
for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
|
||
|
|
cram_slice *s = c->slices[sn];
|
||
|
|
int64_t first_base = INT64_MAX, last_base = INT64_MIN;
|
||
|
|
|
||
|
|
int r1_start = r1;
|
||
|
|
|
||
|
|
assert(sn < c->curr_slice);
|
||
|
|
|
||
|
|
// Discover which read names *may* be safely removed.
|
||
|
|
// Ie which ones have all their records in this slice.
|
||
|
|
if (lossy_read_names(fd, c, s, r1_start) != 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
// Tracking of MD tags so we can spot when the auto-generated values
|
||
|
|
// will differ from the current stored ones. The kstring here is
|
||
|
|
// simply to avoid excessive malloc and free calls. All initialisation
|
||
|
|
// is done within process_one_read().
|
||
|
|
kstring_t MD = {0};
|
||
|
|
|
||
|
|
// Embed consensus / MD-generated ref
|
||
|
|
if (embed_ref == 2) {
|
||
|
|
if (cram_generate_reference(c, s, r1) < 0) {
|
||
|
|
// Should this be a permanent thing via fd->no_ref?
|
||
|
|
// Doing so means we cannot easily switch back again should
|
||
|
|
// things fix themselves later on. This is likely not a
|
||
|
|
// concern though as failure to generate a reference implies
|
||
|
|
// unsorted data which is rarely recovered from.
|
||
|
|
|
||
|
|
// Only if sn == 0. We're hosed if we're on the 2nd slice and
|
||
|
|
// the first worked, as no-ref is a container global param.
|
||
|
|
if (sn > 0) {
|
||
|
|
hts_log_error("Failed to build reference, "
|
||
|
|
"switching to non-ref mode");
|
||
|
|
return -1;
|
||
|
|
} else {
|
||
|
|
hts_log_warning("Failed to build reference, "
|
||
|
|
"switching to non-ref mode");
|
||
|
|
}
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
c->embed_ref = fd->embed_ref = -2; // was previously embed_ref
|
||
|
|
c->no_ref = fd->no_ref = 1;
|
||
|
|
fd->no_ref_counter++; // more likely to keep permanent action
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
failed_embed = 1;
|
||
|
|
goto restart;
|
||
|
|
} else {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
fd->no_ref_counter -= (fd->no_ref_counter > 0);
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Iterate through records creating the cram blocks for some
|
||
|
|
// fields and just gathering stats for others.
|
||
|
|
for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
|
||
|
|
cram_record *cr = &s->crecs[r2];
|
||
|
|
bam_seq_t *b = c->bams[r1];
|
||
|
|
|
||
|
|
/* If multi-ref we need to cope with changing reference per seq */
|
||
|
|
if (c->multi_seq && !no_ref) {
|
||
|
|
if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
|
||
|
|
if (c->ref_seq_id >= 0)
|
||
|
|
cram_ref_decr(fd->refs, c->ref_seq_id);
|
||
|
|
|
||
|
|
if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
|
||
|
|
hts_log_error("Failed to load reference #%d", bam_ref(b));
|
||
|
|
free(MD.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if (validate_md5(fd, bam_ref(b)) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
c->ref_seq_id = bam_ref(b); // overwritten later by -2
|
||
|
|
if (!fd->refs->ref_id[c->ref_seq_id]->seq)
|
||
|
|
return -1;
|
||
|
|
c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
|
||
|
|
c->ref_start = 1;
|
||
|
|
c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
|
||
|
|
no_ref) != 0) {
|
||
|
|
free(MD.s);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (first_base > cr->apos)
|
||
|
|
first_base = cr->apos;
|
||
|
|
|
||
|
|
if (last_base < cr->aend)
|
||
|
|
last_base = cr->aend;
|
||
|
|
}
|
||
|
|
|
||
|
|
free(MD.s);
|
||
|
|
|
||
|
|
// Process_one_read doesn't add read names as it can change
|
||
|
|
// its mind during the loop on the CRAM_FLAG_DETACHED setting
|
||
|
|
// of earlier records (if it detects the auto-generation of
|
||
|
|
// TLEN is incorrect). This affects which read-names can be
|
||
|
|
// lossily compressed, so we do these in another pass.
|
||
|
|
if (add_read_names(fd, c, s, r1_start) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (c->multi_seq) {
|
||
|
|
s->hdr->ref_seq_id = -2;
|
||
|
|
s->hdr->ref_seq_start = 0;
|
||
|
|
s->hdr->ref_seq_span = 0;
|
||
|
|
} else if (c->ref_id == -1 && CRAM_ge31(fd->version)) {
|
||
|
|
// Spec states span=0, but it broke our range queries.
|
||
|
|
// See commit message for this and prior.
|
||
|
|
s->hdr->ref_seq_id = -1;
|
||
|
|
s->hdr->ref_seq_start = 0;
|
||
|
|
s->hdr->ref_seq_span = 0;
|
||
|
|
} else {
|
||
|
|
s->hdr->ref_seq_id = c->ref_id;
|
||
|
|
s->hdr->ref_seq_start = first_base;
|
||
|
|
s->hdr->ref_seq_span = MAX(0, last_base - first_base + 1);
|
||
|
|
}
|
||
|
|
s->hdr->num_records = r2;
|
||
|
|
|
||
|
|
// Processed a slice, now stash the aux blocks so the next
|
||
|
|
// slice can start aggregating them from the start again.
|
||
|
|
if (c->tags_used->n_occupied) {
|
||
|
|
int ntags = c->tags_used->n_occupied;
|
||
|
|
s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
|
||
|
|
if (!s->aux_block)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
khint_t k;
|
||
|
|
|
||
|
|
s->naux_block = 0;
|
||
|
|
for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
|
||
|
|
if (!kh_exist(c->tags_used, k))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
cram_tag_map *tm = kh_val(c->tags_used, k);
|
||
|
|
if (!tm) goto_err;
|
||
|
|
if (!tm->blk) continue;
|
||
|
|
s->aux_block[s->naux_block++] = tm->blk;
|
||
|
|
tm->blk = NULL;
|
||
|
|
if (!tm->blk2) continue;
|
||
|
|
s->aux_block[s->naux_block++] = tm->blk2;
|
||
|
|
tm->blk2 = NULL;
|
||
|
|
}
|
||
|
|
assert(s->naux_block <= 2*c->tags_used->n_occupied);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (c->multi_seq && !no_ref) {
|
||
|
|
if (c->ref_seq_id >= 0)
|
||
|
|
cram_ref_decr(fd->refs, c->ref_seq_id);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Link our bams[] array onto the spare bam list for reuse */
|
||
|
|
spares = malloc(sizeof(*spares));
|
||
|
|
if (!spares) goto_err;
|
||
|
|
pthread_mutex_lock(&fd->bam_list_lock);
|
||
|
|
spares->bams = c->bams;
|
||
|
|
spares->next = fd->bl;
|
||
|
|
fd->bl = spares;
|
||
|
|
pthread_mutex_unlock(&fd->bam_list_lock);
|
||
|
|
c->bams = NULL;
|
||
|
|
|
||
|
|
/* Detect if a multi-seq container */
|
||
|
|
cram_stats_encoding(fd, c->stats[DS_RI]);
|
||
|
|
multi_ref = c->stats[DS_RI]->nvals > 1;
|
||
|
|
pthread_mutex_lock(&fd->metrics_lock);
|
||
|
|
fd->last_RI_count = c->stats[DS_RI]->nvals;
|
||
|
|
pthread_mutex_unlock(&fd->metrics_lock);
|
||
|
|
|
||
|
|
|
||
|
|
if (multi_ref) {
|
||
|
|
hts_log_info("Multi-ref container");
|
||
|
|
c->ref_seq_id = -2;
|
||
|
|
c->ref_seq_start = 0;
|
||
|
|
c->ref_seq_span = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/* Compute MD5s */
|
||
|
|
no_ref = c->no_ref;
|
||
|
|
int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
|
||
|
|
|
||
|
|
for (i = 0; i < c->curr_slice; i++) {
|
||
|
|
cram_slice *s = c->slices[i];
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) != 1) {
|
||
|
|
if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
|
||
|
|
hts_md5_context *md5 = hts_md5_init();
|
||
|
|
if (!md5)
|
||
|
|
return -1;
|
||
|
|
hts_md5_update(md5,
|
||
|
|
c->ref + s->hdr->ref_seq_start - c->ref_start,
|
||
|
|
s->hdr->ref_seq_span);
|
||
|
|
hts_md5_final(s->hdr->md5, md5);
|
||
|
|
hts_md5_destroy(md5);
|
||
|
|
} else {
|
||
|
|
memset(s->hdr->md5, 0, 16);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
c->num_records = 0;
|
||
|
|
c->num_blocks = 1; // cram_block_compression_hdr
|
||
|
|
c->length = 0;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== BF ===\n");
|
||
|
|
h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
|
||
|
|
c->stats[DS_BF], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== CF ===\n");
|
||
|
|
h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
|
||
|
|
c->stats[DS_CF], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== RN ===\n");
|
||
|
|
//h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
|
||
|
|
// c->stats[DS_RN], E_BYTE_ARRAY, NULL,
|
||
|
|
// fd->version);
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== AP ===\n");
|
||
|
|
if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
if (c->pos_sorted)
|
||
|
|
h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
|
||
|
|
c->stats[DS_AP],
|
||
|
|
is_v4 ? E_LONG : E_INT,
|
||
|
|
NULL, fd->version, &fd->vv);
|
||
|
|
else
|
||
|
|
// Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
|
||
|
|
h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
|
||
|
|
: E_EXTERNAL,
|
||
|
|
NULL,
|
||
|
|
is_v4 ? E_LONG : E_INT,
|
||
|
|
NULL, fd->version, &fd->vv);
|
||
|
|
} else {
|
||
|
|
// Removed BETA in v4.0.
|
||
|
|
// Should we consider dropping use of it for 3.0 too?
|
||
|
|
hts_pos_t p[2] = {0, c->max_apos};
|
||
|
|
h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
|
||
|
|
is_v4 ? E_LONG : E_INT,
|
||
|
|
p, fd->version, &fd->vv);
|
||
|
|
// cram_xdelta_encoder e;
|
||
|
|
// e.word_size = is_v4 ? 8 : 4;
|
||
|
|
// e.sub_encoding = E_EXTERNAL;
|
||
|
|
// e.sub_codec_dat = (void *)DS_AP;
|
||
|
|
//
|
||
|
|
// h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
|
||
|
|
// is_v4 ? E_LONG : E_INT,
|
||
|
|
// &e, fd->version, &fd->vv);
|
||
|
|
}
|
||
|
|
if (!h->codecs[DS_AP]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== RG ===\n");
|
||
|
|
h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
|
||
|
|
c->stats[DS_RG],
|
||
|
|
E_INT,
|
||
|
|
NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== MQ ===\n");
|
||
|
|
h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
|
||
|
|
c->stats[DS_MQ], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== NS ===\n");
|
||
|
|
h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
|
||
|
|
c->stats[DS_NS], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== MF ===\n");
|
||
|
|
h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
|
||
|
|
c->stats[DS_MF], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== TS ===\n");
|
||
|
|
h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
|
||
|
|
c->stats[DS_TS],
|
||
|
|
is_v4 ? E_LONG : E_INT,
|
||
|
|
NULL, fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== NP ===\n");
|
||
|
|
h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
|
||
|
|
c->stats[DS_NP],
|
||
|
|
is_v4 ? E_LONG : E_INT,
|
||
|
|
NULL, fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== NF ===\n");
|
||
|
|
h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
|
||
|
|
c->stats[DS_NF], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== RL ===\n");
|
||
|
|
h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
|
||
|
|
c->stats[DS_RL], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== FN ===\n");
|
||
|
|
h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
|
||
|
|
c->stats[DS_FN], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== FC ===\n");
|
||
|
|
h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
|
||
|
|
c->stats[DS_FC], E_BYTE, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== FP ===\n");
|
||
|
|
h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
|
||
|
|
c->stats[DS_FP], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== DL ===\n");
|
||
|
|
h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
|
||
|
|
c->stats[DS_DL], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== BA ===\n");
|
||
|
|
h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
|
||
|
|
c->stats[DS_BA], E_BYTE, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
||
|
|
cram_byte_array_len_encoder e;
|
||
|
|
|
||
|
|
e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
|
||
|
|
? E_VARINT_UNSIGNED
|
||
|
|
: E_EXTERNAL;
|
||
|
|
e.len_dat = (void *)DS_BB_len;
|
||
|
|
//e.len_dat = (void *)DS_BB;
|
||
|
|
|
||
|
|
e.val_encoding = E_EXTERNAL;
|
||
|
|
e.val_dat = (void *)DS_BB;
|
||
|
|
|
||
|
|
h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
|
||
|
|
E_BYTE_ARRAY, (void *)&e,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (!h->codecs[DS_BB]) goto_err;
|
||
|
|
} else {
|
||
|
|
h->codecs[DS_BB] = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== BS ===\n");
|
||
|
|
h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
|
||
|
|
c->stats[DS_BS], E_BYTE, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
h->codecs[DS_TL] = NULL;
|
||
|
|
h->codecs[DS_RI] = NULL;
|
||
|
|
h->codecs[DS_RS] = NULL;
|
||
|
|
h->codecs[DS_PD] = NULL;
|
||
|
|
h->codecs[DS_HC] = NULL;
|
||
|
|
h->codecs[DS_SC] = NULL;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== TC ===\n");
|
||
|
|
h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
|
||
|
|
c->stats[DS_TC], E_BYTE, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== TN ===\n");
|
||
|
|
h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
|
||
|
|
c->stats[DS_TN], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
|
||
|
|
} else {
|
||
|
|
h->codecs[DS_TC] = NULL;
|
||
|
|
h->codecs[DS_TN] = NULL;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== TL ===\n");
|
||
|
|
h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
|
||
|
|
c->stats[DS_TL], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
|
||
|
|
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== RI ===\n");
|
||
|
|
h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
|
||
|
|
c->stats[DS_RI], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== RS ===\n");
|
||
|
|
h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
|
||
|
|
c->stats[DS_RS], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== PD ===\n");
|
||
|
|
h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
|
||
|
|
c->stats[DS_PD], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== HC ===\n");
|
||
|
|
h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
|
||
|
|
c->stats[DS_HC], E_INT, NULL,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== SC ===\n");
|
||
|
|
if (1) {
|
||
|
|
int i2[2] = {0, DS_SC};
|
||
|
|
|
||
|
|
h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
|
||
|
|
E_BYTE_ARRAY, (void *)i2,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
} else {
|
||
|
|
// Appears to be no practical benefit to using this method,
|
||
|
|
// but it may work better if we start mixing SC, IN and BB
|
||
|
|
// elements into the same external block.
|
||
|
|
cram_byte_array_len_encoder e;
|
||
|
|
|
||
|
|
e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
|
||
|
|
? E_VARINT_UNSIGNED
|
||
|
|
: E_EXTERNAL;
|
||
|
|
e.len_dat = (void *)DS_SC_len;
|
||
|
|
|
||
|
|
e.val_encoding = E_EXTERNAL;
|
||
|
|
e.val_dat = (void *)DS_SC;
|
||
|
|
|
||
|
|
h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
|
||
|
|
E_BYTE_ARRAY, (void *)&e,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
}
|
||
|
|
if (!h->codecs[DS_SC]) goto_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
//fprintf(stderr, "=== IN ===\n");
|
||
|
|
{
|
||
|
|
int i2[2] = {0, DS_IN};
|
||
|
|
h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
|
||
|
|
E_BYTE_ARRAY, (void *)i2,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (!h->codecs[DS_IN]) goto_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
|
||
|
|
(void *)DS_QS,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (!h->codecs[DS_QS]) goto_err;
|
||
|
|
{
|
||
|
|
int i2[2] = {0, DS_RN};
|
||
|
|
h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
|
||
|
|
E_BYTE_ARRAY, (void *)i2,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
if (!h->codecs[DS_RN]) goto_err;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/* Encode slices */
|
||
|
|
for (i = 0; i < c->curr_slice; i++) {
|
||
|
|
hts_log_info("Encode slice %d", i);
|
||
|
|
|
||
|
|
int local_embed_ref =
|
||
|
|
embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
|
||
|
|
if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Create compression header */
|
||
|
|
{
|
||
|
|
h->ref_seq_id = c->ref_seq_id;
|
||
|
|
h->ref_seq_start = c->ref_seq_start;
|
||
|
|
h->ref_seq_span = c->ref_seq_span;
|
||
|
|
h->num_records = c->num_records;
|
||
|
|
h->qs_seq_orient = c->qs_seq_orient;
|
||
|
|
// slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
|
||
|
|
h->AP_delta = c->pos_sorted;
|
||
|
|
memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
|
||
|
|
|
||
|
|
if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Compute landmarks */
|
||
|
|
/* Fill out slice landmarks */
|
||
|
|
c->num_landmarks = c->curr_slice;
|
||
|
|
c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
|
||
|
|
if (!c->landmark)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Slice offset starts after the first block, so we need to simulate
|
||
|
|
* writing it to work out the correct offset
|
||
|
|
*/
|
||
|
|
{
|
||
|
|
slice_offset = c_hdr->method == RAW
|
||
|
|
? c_hdr->uncomp_size
|
||
|
|
: c_hdr->comp_size;
|
||
|
|
slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
|
||
|
|
fd->vv.varint_size(c_hdr->content_id) +
|
||
|
|
fd->vv.varint_size(c_hdr->comp_size) +
|
||
|
|
fd->vv.varint_size(c_hdr->uncomp_size);
|
||
|
|
}
|
||
|
|
|
||
|
|
c->ref_seq_id = c->slices[0]->hdr->ref_seq_id;
|
||
|
|
if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) {
|
||
|
|
// Spec states span=0, but it broke our range queries.
|
||
|
|
// See commit message for this and prior.
|
||
|
|
c->ref_seq_start = 0;
|
||
|
|
c->ref_seq_span = 0;
|
||
|
|
} else {
|
||
|
|
c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
|
||
|
|
c->ref_seq_span = c->slices[0]->hdr->ref_seq_span;
|
||
|
|
}
|
||
|
|
for (i = 0; i < c->curr_slice; i++) {
|
||
|
|
cram_slice *s = c->slices[i];
|
||
|
|
|
||
|
|
c->num_blocks += s->hdr->num_blocks + 1; // slice header
|
||
|
|
c->landmark[i] = slice_offset;
|
||
|
|
|
||
|
|
if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
|
||
|
|
c->ref_seq_start + c->ref_seq_span) {
|
||
|
|
c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
|
||
|
|
- c->ref_seq_start;
|
||
|
|
}
|
||
|
|
|
||
|
|
slice_offset += s->hdr_block->method == RAW
|
||
|
|
? s->hdr_block->uncomp_size
|
||
|
|
: s->hdr_block->comp_size;
|
||
|
|
|
||
|
|
slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
|
||
|
|
fd->vv.varint_size(s->hdr_block->content_id) +
|
||
|
|
fd->vv.varint_size(s->hdr_block->comp_size) +
|
||
|
|
fd->vv.varint_size(s->hdr_block->uncomp_size);
|
||
|
|
|
||
|
|
for (j = 0; j < s->hdr->num_blocks; j++) {
|
||
|
|
slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
|
||
|
|
fd->vv.varint_size(s->block[j]->content_id) +
|
||
|
|
fd->vv.varint_size(s->block[j]->comp_size) +
|
||
|
|
fd->vv.varint_size(s->block[j]->uncomp_size);
|
||
|
|
|
||
|
|
slice_offset += s->block[j]->method == RAW
|
||
|
|
? s->block[j]->uncomp_size
|
||
|
|
: s->block[j]->comp_size;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
c->length += slice_offset; // just past the final slice
|
||
|
|
|
||
|
|
c->comp_hdr_block = c_hdr;
|
||
|
|
|
||
|
|
if (c->ref_seq_id >= 0) {
|
||
|
|
if (c->ref_free) {
|
||
|
|
free(c->ref);
|
||
|
|
c->ref = NULL;
|
||
|
|
} else {
|
||
|
|
cram_ref_decr(fd->refs, c->ref_seq_id);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Cache references up-front if we have unsorted access patterns */
|
||
|
|
if (!no_ref && c->refs_used) {
|
||
|
|
for (i = 0; i < fd->refs->nref; i++) {
|
||
|
|
if (c->refs_used[i])
|
||
|
|
cram_ref_decr(fd->refs, i);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Adds a feature code to a read within a slice. For purposes of minimising
|
||
|
|
* memory allocations and fragmentation we have one array of features for all
|
||
|
|
* reads within the slice. We return the index into this array for this new
|
||
|
|
* feature.
|
||
|
|
*
|
||
|
|
* Returns feature index on success
|
||
|
|
* -1 on failure.
|
||
|
|
*/
|
||
|
|
static int cram_add_feature(cram_container *c, cram_slice *s,
|
||
|
|
cram_record *r, cram_feature *f) {
|
||
|
|
if (s->nfeatures >= s->afeatures) {
|
||
|
|
s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
|
||
|
|
s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
|
||
|
|
if (!s->features)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!r->nfeature++) {
|
||
|
|
r->feature = s->nfeatures;
|
||
|
|
if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
|
||
|
|
return -1;
|
||
|
|
} else {
|
||
|
|
if (cram_stats_add(c->stats[DS_FP],
|
||
|
|
f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
}
|
||
|
|
if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
s->features[s->nfeatures++] = *f;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_substitution(cram_fd *fd, cram_container *c,
|
||
|
|
cram_slice *s, cram_record *r,
|
||
|
|
int pos, char base, char qual, char ref) {
|
||
|
|
cram_feature f;
|
||
|
|
|
||
|
|
// seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
|
||
|
|
if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
|
||
|
|
f.X.pos = pos+1;
|
||
|
|
f.X.code = 'X';
|
||
|
|
f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
|
||
|
|
if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
|
||
|
|
return -1;
|
||
|
|
} else {
|
||
|
|
f.B.pos = pos+1;
|
||
|
|
f.B.code = 'B';
|
||
|
|
f.B.base = base;
|
||
|
|
f.B.qual = qual;
|
||
|
|
if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
|
||
|
|
if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
|
||
|
|
BLOCK_APPEND_CHAR(s->qual_blk, qual);
|
||
|
|
}
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_bases(cram_fd *fd, cram_container *c,
|
||
|
|
cram_slice *s, cram_record *r,
|
||
|
|
int pos, int len, char *base) {
|
||
|
|
cram_feature f;
|
||
|
|
|
||
|
|
f.b.pos = pos+1;
|
||
|
|
f.b.code = 'b';
|
||
|
|
f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
|
||
|
|
f.b.len = len;
|
||
|
|
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_base(cram_fd *fd, cram_container *c,
|
||
|
|
cram_slice *s, cram_record *r,
|
||
|
|
int pos, char base, char qual) {
|
||
|
|
cram_feature f;
|
||
|
|
f.B.pos = pos+1;
|
||
|
|
f.B.code = 'B';
|
||
|
|
f.B.base = base;
|
||
|
|
f.B.qual = qual;
|
||
|
|
if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
|
||
|
|
if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
|
||
|
|
BLOCK_APPEND_CHAR(s->qual_blk, qual);
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_quality(cram_fd *fd, cram_container *c,
|
||
|
|
cram_slice *s, cram_record *r,
|
||
|
|
int pos, char qual) {
|
||
|
|
cram_feature f;
|
||
|
|
f.Q.pos = pos+1;
|
||
|
|
f.Q.code = 'Q';
|
||
|
|
f.Q.qual = qual;
|
||
|
|
if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
|
||
|
|
BLOCK_APPEND_CHAR(s->qual_blk, qual);
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
|
||
|
|
int pos, int len, char *base) {
|
||
|
|
cram_feature f;
|
||
|
|
f.D.pos = pos+1;
|
||
|
|
f.D.code = 'D';
|
||
|
|
f.D.len = len;
|
||
|
|
if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
|
||
|
|
int pos, int len, char *base, int version) {
|
||
|
|
cram_feature f;
|
||
|
|
f.S.pos = pos+1;
|
||
|
|
f.S.code = 'S';
|
||
|
|
f.S.len = len;
|
||
|
|
switch (CRAM_MAJOR_VERS(version)) {
|
||
|
|
case 1:
|
||
|
|
f.S.seq_idx = BLOCK_SIZE(s->base_blk);
|
||
|
|
BLOCK_APPEND(s->base_blk, base, len);
|
||
|
|
BLOCK_APPEND_CHAR(s->base_blk, '\0');
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 2:
|
||
|
|
default:
|
||
|
|
f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
|
||
|
|
if (base) {
|
||
|
|
BLOCK_APPEND(s->soft_blk, base, len);
|
||
|
|
} else {
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < len; i++)
|
||
|
|
BLOCK_APPEND_CHAR(s->soft_blk, 'N');
|
||
|
|
}
|
||
|
|
BLOCK_APPEND_CHAR(s->soft_blk, '\0');
|
||
|
|
break;
|
||
|
|
|
||
|
|
//default:
|
||
|
|
// // v3.0 onwards uses BB data-series
|
||
|
|
// f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
|
||
|
|
}
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
|
||
|
|
int pos, int len, char *base) {
|
||
|
|
cram_feature f;
|
||
|
|
f.S.pos = pos+1;
|
||
|
|
f.S.code = 'H';
|
||
|
|
f.S.len = len;
|
||
|
|
if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
|
||
|
|
int pos, int len, char *base) {
|
||
|
|
cram_feature f;
|
||
|
|
f.S.pos = pos+1;
|
||
|
|
f.S.code = 'N';
|
||
|
|
f.S.len = len;
|
||
|
|
if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
|
||
|
|
int pos, int len, char *base) {
|
||
|
|
cram_feature f;
|
||
|
|
f.S.pos = pos+1;
|
||
|
|
f.S.code = 'P';
|
||
|
|
f.S.len = len;
|
||
|
|
if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
}
|
||
|
|
|
||
|
|
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
|
||
|
|
int pos, int len, char *base) {
|
||
|
|
cram_feature f;
|
||
|
|
f.I.pos = pos+1;
|
||
|
|
if (len == 1) {
|
||
|
|
char b = base ? *base : 'N';
|
||
|
|
f.i.code = 'i';
|
||
|
|
f.i.base = b;
|
||
|
|
if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
|
||
|
|
} else {
|
||
|
|
f.I.code = 'I';
|
||
|
|
f.I.len = len;
|
||
|
|
f.S.seq_idx = BLOCK_SIZE(s->base_blk);
|
||
|
|
if (base) {
|
||
|
|
BLOCK_APPEND(s->base_blk, base, len);
|
||
|
|
} else {
|
||
|
|
int i;
|
||
|
|
for (i = 0; i < len; i++)
|
||
|
|
BLOCK_APPEND_CHAR(s->base_blk, 'N');
|
||
|
|
}
|
||
|
|
BLOCK_APPEND_CHAR(s->base_blk, '\0');
|
||
|
|
}
|
||
|
|
return cram_add_feature(c, s, r, &f);
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Encodes auxiliary data. Largely duplicated from above, but done so to
|
||
|
|
* keep it simple and avoid a myriad of version ifs.
|
||
|
|
*
|
||
|
|
* Returns the RG header line pointed to by the BAM aux fields on success,
|
||
|
|
* NULL on failure or no rg present, also sets "*err" to non-zero
|
||
|
|
*/
|
||
|
|
static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
|
||
|
|
cram_container *c,
|
||
|
|
cram_slice *s, cram_record *cr,
|
||
|
|
int verbatim_NM, int verbatim_MD,
|
||
|
|
int NM, kstring_t *MD, int cf_tag,
|
||
|
|
int no_ref, int *err) {
|
||
|
|
char *aux, *orig;
|
||
|
|
sam_hrec_rg_t *brg = NULL;
|
||
|
|
int aux_size = bam_get_l_aux(b);
|
||
|
|
const char *aux_end = bam_data_end(b);
|
||
|
|
cram_block *td_b = c->comp_hdr->TD_blk;
|
||
|
|
int TD_blk_size = BLOCK_SIZE(td_b), new;
|
||
|
|
char *key;
|
||
|
|
khint_t k;
|
||
|
|
|
||
|
|
if (err) *err = 1;
|
||
|
|
|
||
|
|
orig = aux = (char *)bam_aux(b);
|
||
|
|
|
||
|
|
|
||
|
|
// cF:i => Extra CRAM bit flags.
|
||
|
|
// 1: Don't auto-decode MD (may be invalid)
|
||
|
|
// 2: Don't auto-decode NM (may be invalid)
|
||
|
|
if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
|
||
|
|
// Temporary copy of aux so we can ammend it.
|
||
|
|
aux = malloc(aux_size+4);
|
||
|
|
if (!aux)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
memcpy(aux, orig, aux_size);
|
||
|
|
aux[aux_size++] = 'c';
|
||
|
|
aux[aux_size++] = 'F';
|
||
|
|
aux[aux_size++] = 'C';
|
||
|
|
aux[aux_size++] = cf_tag;
|
||
|
|
orig = aux;
|
||
|
|
aux_end = aux + aux_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Copy aux keys to td_b and aux values to slice aux blocks
|
||
|
|
while (aux_end - aux >= 1 && aux[0] != 0) {
|
||
|
|
int r;
|
||
|
|
|
||
|
|
// Room for code + type + at least 1 byte of data
|
||
|
|
if (aux - orig >= aux_size - 3)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
// RG:Z
|
||
|
|
if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
|
||
|
|
char *rg = &aux[3];
|
||
|
|
aux = rg;
|
||
|
|
while (aux < aux_end && *aux++);
|
||
|
|
if (aux == aux_end && aux[-1] != '\0') {
|
||
|
|
hts_log_error("Unterminated RG:Z tag for read \"%s\"",
|
||
|
|
bam_get_qname(b));
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
|
||
|
|
if (brg) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4)
|
||
|
|
BLOCK_APPEND(td_b, "RG*", 3);
|
||
|
|
continue;
|
||
|
|
} else {
|
||
|
|
// RG:Z tag will be stored verbatim
|
||
|
|
hts_log_warning("Missing @RG header for RG \"%s\"", rg);
|
||
|
|
aux = rg - 3;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// MD:Z
|
||
|
|
if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
|
||
|
|
if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
|
||
|
|
if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
|
||
|
|
while (aux < aux_end && *aux++);
|
||
|
|
if (aux == aux_end && aux[-1] != '\0') {
|
||
|
|
hts_log_error("Unterminated MD:Z tag for read \"%s\"",
|
||
|
|
bam_get_qname(b));
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4)
|
||
|
|
BLOCK_APPEND(td_b, "MD*", 3);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// NM:i
|
||
|
|
if (aux[0] == 'N' && aux[1] == 'M') {
|
||
|
|
if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
|
||
|
|
int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
|
||
|
|
if (NM_ == NM) {
|
||
|
|
switch(aux[2]) {
|
||
|
|
case 'A': case 'C': case 'c': aux+=4; break;
|
||
|
|
case 'S': case 's': aux+=5; break;
|
||
|
|
case 'I': case 'i': case 'f': aux+=7; break;
|
||
|
|
default:
|
||
|
|
hts_log_error("Unhandled type code for NM tag");
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4)
|
||
|
|
BLOCK_APPEND(td_b, "NM*", 3);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
BLOCK_APPEND(td_b, aux, 3);
|
||
|
|
|
||
|
|
// Container level tags_used, for TD series
|
||
|
|
// Maps integer key ('X0i') to cram_tag_map struct.
|
||
|
|
int key = (((unsigned char *) aux)[0]<<16 |
|
||
|
|
((unsigned char *) aux)[1]<<8 |
|
||
|
|
((unsigned char *) aux)[2]);
|
||
|
|
k = kh_put(m_tagmap, c->tags_used, key, &r);
|
||
|
|
if (-1 == r)
|
||
|
|
goto err;
|
||
|
|
else if (r != 0)
|
||
|
|
kh_val(c->tags_used, k) = NULL;
|
||
|
|
|
||
|
|
if (r == 1) {
|
||
|
|
khint_t k_global;
|
||
|
|
|
||
|
|
// Global tags_used for cram_metrics support
|
||
|
|
pthread_mutex_lock(&fd->metrics_lock);
|
||
|
|
k_global = kh_put(m_metrics, fd->tags_used, key, &r);
|
||
|
|
if (-1 == r) {
|
||
|
|
pthread_mutex_unlock(&fd->metrics_lock);
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
if (r >= 1) {
|
||
|
|
kh_val(fd->tags_used, k_global) = cram_new_metrics();
|
||
|
|
if (!kh_val(fd->tags_used, k_global)) {
|
||
|
|
kh_del(m_metrics, fd->tags_used, k_global);
|
||
|
|
pthread_mutex_unlock(&fd->metrics_lock);
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
pthread_mutex_unlock(&fd->metrics_lock);
|
||
|
|
|
||
|
|
int i2[2] = {'\t',key};
|
||
|
|
size_t sk = key;
|
||
|
|
cram_tag_map *m = calloc(1, sizeof(*m));
|
||
|
|
if (!m)
|
||
|
|
goto_err;
|
||
|
|
kh_val(c->tags_used, k) = m;
|
||
|
|
|
||
|
|
cram_codec *c;
|
||
|
|
|
||
|
|
// Use a block content id based on the tag id.
|
||
|
|
// Codec type depends on tag data type.
|
||
|
|
switch(aux[2]) {
|
||
|
|
case 'Z': case 'H':
|
||
|
|
// string as byte_array_stop
|
||
|
|
c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
|
||
|
|
E_BYTE_ARRAY, (void *)i2,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'A': case 'c': case 'C': {
|
||
|
|
// byte array len, 1 byte
|
||
|
|
cram_byte_array_len_encoder e;
|
||
|
|
cram_stats st;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) <= 3) {
|
||
|
|
e.len_encoding = E_HUFFMAN;
|
||
|
|
e.len_dat = NULL; // will get codes from st
|
||
|
|
} else {
|
||
|
|
e.len_encoding = E_CONST_INT;
|
||
|
|
e.len_dat = NULL; // will get codes from st
|
||
|
|
}
|
||
|
|
memset(&st, 0, sizeof(st));
|
||
|
|
if (cram_stats_add(&st, 1) < 0) goto block_err;
|
||
|
|
cram_stats_encoding(fd, &st);
|
||
|
|
|
||
|
|
e.val_encoding = E_EXTERNAL;
|
||
|
|
e.val_dat = (void *)sk;
|
||
|
|
|
||
|
|
c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
|
||
|
|
E_BYTE_ARRAY, (void *)&e,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 's': case 'S': {
|
||
|
|
// byte array len, 2 byte
|
||
|
|
cram_byte_array_len_encoder e;
|
||
|
|
cram_stats st;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) <= 3) {
|
||
|
|
e.len_encoding = E_HUFFMAN;
|
||
|
|
e.len_dat = NULL; // will get codes from st
|
||
|
|
} else {
|
||
|
|
e.len_encoding = E_CONST_INT;
|
||
|
|
e.len_dat = NULL; // will get codes from st
|
||
|
|
}
|
||
|
|
memset(&st, 0, sizeof(st));
|
||
|
|
if (cram_stats_add(&st, 2) < 0) goto block_err;
|
||
|
|
cram_stats_encoding(fd, &st);
|
||
|
|
|
||
|
|
e.val_encoding = E_EXTERNAL;
|
||
|
|
e.val_dat = (void *)sk;
|
||
|
|
|
||
|
|
c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
|
||
|
|
E_BYTE_ARRAY, (void *)&e,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
case 'i': case 'I': case 'f': {
|
||
|
|
// byte array len, 4 byte
|
||
|
|
cram_byte_array_len_encoder e;
|
||
|
|
cram_stats st;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) <= 3) {
|
||
|
|
e.len_encoding = E_HUFFMAN;
|
||
|
|
e.len_dat = NULL; // will get codes from st
|
||
|
|
} else {
|
||
|
|
e.len_encoding = E_CONST_INT;
|
||
|
|
e.len_dat = NULL; // will get codes from st
|
||
|
|
}
|
||
|
|
memset(&st, 0, sizeof(st));
|
||
|
|
if (cram_stats_add(&st, 4) < 0) goto block_err;
|
||
|
|
cram_stats_encoding(fd, &st);
|
||
|
|
|
||
|
|
e.val_encoding = E_EXTERNAL;
|
||
|
|
e.val_dat = (void *)sk;
|
||
|
|
|
||
|
|
c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
|
||
|
|
E_BYTE_ARRAY, (void *)&e,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'B': {
|
||
|
|
// Byte array of variable size, but we generate our tag
|
||
|
|
// byte stream at the wrong stage (during reading and not
|
||
|
|
// after slice header construction). So we use
|
||
|
|
// BYTE_ARRAY_LEN with the length codec being external
|
||
|
|
// too.
|
||
|
|
cram_byte_array_len_encoder e;
|
||
|
|
|
||
|
|
e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
|
||
|
|
? E_VARINT_UNSIGNED
|
||
|
|
: E_EXTERNAL;
|
||
|
|
e.len_dat = (void *)sk; // or key+128 for len?
|
||
|
|
|
||
|
|
e.val_encoding = E_EXTERNAL;
|
||
|
|
e.val_dat = (void *)sk;
|
||
|
|
|
||
|
|
c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
|
||
|
|
E_BYTE_ARRAY, (void *)&e,
|
||
|
|
fd->version, &fd->vv);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
default:
|
||
|
|
hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
|
||
|
|
c = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!c)
|
||
|
|
goto_err;
|
||
|
|
|
||
|
|
m->codec = c;
|
||
|
|
|
||
|
|
// Link to fd-global tag metrics
|
||
|
|
pthread_mutex_lock(&fd->metrics_lock);
|
||
|
|
m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
|
||
|
|
pthread_mutex_unlock(&fd->metrics_lock);
|
||
|
|
}
|
||
|
|
|
||
|
|
cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
|
||
|
|
if (!tm) goto_err;
|
||
|
|
cram_codec *codec = tm->codec;
|
||
|
|
if (!tm->codec) goto_err;
|
||
|
|
|
||
|
|
switch(aux[2]) {
|
||
|
|
case 'A': case 'C': case 'c':
|
||
|
|
if (aux_end - aux < 3+1)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
if (!tm->blk) {
|
||
|
|
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
|
||
|
|
goto err;
|
||
|
|
codec->u.e_byte_array_len.val_codec->out = tm->blk;
|
||
|
|
}
|
||
|
|
|
||
|
|
aux+=3;
|
||
|
|
//codec->encode(s, codec, aux, 1);
|
||
|
|
// Functionally equivalent, but less code.
|
||
|
|
BLOCK_APPEND_CHAR(tm->blk, *aux);
|
||
|
|
aux++;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'S': case 's':
|
||
|
|
if (aux_end - aux < 3+2)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
if (!tm->blk) {
|
||
|
|
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
|
||
|
|
goto err;
|
||
|
|
codec->u.e_byte_array_len.val_codec->out = tm->blk;
|
||
|
|
}
|
||
|
|
|
||
|
|
aux+=3;
|
||
|
|
//codec->encode(s, codec, aux, 2);
|
||
|
|
BLOCK_APPEND(tm->blk, aux, 2);
|
||
|
|
aux+=2;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'I': case 'i': case 'f':
|
||
|
|
if (aux_end - aux < 3+4)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
if (!tm->blk) {
|
||
|
|
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
|
||
|
|
goto err;
|
||
|
|
codec->u.e_byte_array_len.val_codec->out = tm->blk;
|
||
|
|
}
|
||
|
|
|
||
|
|
aux+=3;
|
||
|
|
//codec->encode(s, codec, aux, 4);
|
||
|
|
BLOCK_APPEND(tm->blk, aux, 4);
|
||
|
|
aux+=4;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'd':
|
||
|
|
if (aux_end - aux < 3+8)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
if (!tm->blk) {
|
||
|
|
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
|
||
|
|
goto err;
|
||
|
|
codec->u.e_byte_array_len.val_codec->out = tm->blk;
|
||
|
|
}
|
||
|
|
|
||
|
|
aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
|
||
|
|
//codec->encode(s, codec, aux, 8);
|
||
|
|
BLOCK_APPEND(tm->blk, aux, 8);
|
||
|
|
aux+=8;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case 'Z': case 'H': {
|
||
|
|
if (aux_end - aux < 3)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
if (!tm->blk) {
|
||
|
|
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
|
||
|
|
goto err;
|
||
|
|
codec->out = tm->blk;
|
||
|
|
}
|
||
|
|
|
||
|
|
char *aux_s;
|
||
|
|
aux += 3;
|
||
|
|
aux_s = aux;
|
||
|
|
while (aux < aux_end && *aux++);
|
||
|
|
if (aux == aux_end && aux[-1] != '\0') {
|
||
|
|
hts_log_error("Unterminated %c%c:%c tag for read \"%s\"",
|
||
|
|
aux_s[-3], aux_s[-2], aux_s[-1],
|
||
|
|
bam_get_qname(b));
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
|
||
|
|
goto err;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
case 'B': {
|
||
|
|
if (aux_end - aux < 4+4)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
int type = aux[3];
|
||
|
|
uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
|
||
|
|
((uint64_t)((unsigned char *)aux)[5]) << 8 |
|
||
|
|
((uint64_t)((unsigned char *)aux)[6]) <<16 |
|
||
|
|
((uint64_t)((unsigned char *)aux)[7]) <<24);
|
||
|
|
uint64_t blen;
|
||
|
|
if (!tm->blk) {
|
||
|
|
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
|
||
|
|
goto err;
|
||
|
|
if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
|
||
|
|
if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
|
||
|
|
goto err;
|
||
|
|
codec->u.e_byte_array_len.len_codec->out = tm->blk2;
|
||
|
|
codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
|
||
|
|
} else {
|
||
|
|
codec->u.e_byte_array_len.len_codec->out = tm->blk;
|
||
|
|
codec->u.e_byte_array_len.val_codec->out = tm->blk;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// skip TN field
|
||
|
|
aux+=3;
|
||
|
|
|
||
|
|
// We use BYTE_ARRAY_LEN with external length, so store that first
|
||
|
|
switch (type) {
|
||
|
|
case 'c': case 'C':
|
||
|
|
blen = count;
|
||
|
|
break;
|
||
|
|
case 's': case 'S':
|
||
|
|
blen = 2*count;
|
||
|
|
break;
|
||
|
|
case 'i': case 'I': case 'f':
|
||
|
|
blen = 4*count;
|
||
|
|
break;
|
||
|
|
default:
|
||
|
|
hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
|
||
|
|
blen += 5; // sub-type & length
|
||
|
|
if (aux_end - aux < blen || blen > INT_MAX)
|
||
|
|
goto err;
|
||
|
|
|
||
|
|
if (codec->encode(s, codec, aux, (int) blen) < 0)
|
||
|
|
goto err;
|
||
|
|
aux += blen;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
tm->blk->m = tm->m;
|
||
|
|
}
|
||
|
|
|
||
|
|
// FIXME: sort BLOCK_DATA(td_b) by char[3] triples
|
||
|
|
|
||
|
|
// And and increment TD hash entry
|
||
|
|
BLOCK_APPEND_CHAR(td_b, 0);
|
||
|
|
|
||
|
|
// Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
|
||
|
|
key = string_ndup(c->comp_hdr->TD_keys,
|
||
|
|
(char *)BLOCK_DATA(td_b) + TD_blk_size,
|
||
|
|
BLOCK_SIZE(td_b) - TD_blk_size);
|
||
|
|
if (!key)
|
||
|
|
goto block_err;
|
||
|
|
k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
|
||
|
|
if (new < 0) {
|
||
|
|
goto err;
|
||
|
|
} else if (new == 0) {
|
||
|
|
BLOCK_SIZE(td_b) = TD_blk_size;
|
||
|
|
} else {
|
||
|
|
kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
|
||
|
|
c->comp_hdr->nTL++;
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->TL = kh_val(c->comp_hdr->TD_hash, k);
|
||
|
|
if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
if (orig != (char *)bam_aux(b))
|
||
|
|
free(orig);
|
||
|
|
|
||
|
|
if (err) *err = 0;
|
||
|
|
|
||
|
|
return brg;
|
||
|
|
|
||
|
|
err:
|
||
|
|
block_err:
|
||
|
|
if (orig != (char *)bam_aux(b))
|
||
|
|
free(orig);
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* During cram_next_container or before the final flush at end of
|
||
|
|
* file, we update the current slice headers and increment the slice
|
||
|
|
* number to the next slice.
|
||
|
|
*
|
||
|
|
* See cram_next_container() and cram_close().
|
||
|
|
*/
|
||
|
|
void cram_update_curr_slice(cram_container *c, int version) {
|
||
|
|
cram_slice *s = c->slice;
|
||
|
|
if (c->multi_seq) {
|
||
|
|
s->hdr->ref_seq_id = -2;
|
||
|
|
s->hdr->ref_seq_start = 0;
|
||
|
|
s->hdr->ref_seq_span = 0;
|
||
|
|
} else if (c->curr_ref == -1 && CRAM_ge31(version)) {
|
||
|
|
// Spec states span=0, but it broke our range queries.
|
||
|
|
// See commit message for this and prior.
|
||
|
|
s->hdr->ref_seq_id = -1;
|
||
|
|
s->hdr->ref_seq_start = 0;
|
||
|
|
s->hdr->ref_seq_span = 0;
|
||
|
|
} else {
|
||
|
|
s->hdr->ref_seq_id = c->curr_ref;
|
||
|
|
s->hdr->ref_seq_start = c->first_base;
|
||
|
|
s->hdr->ref_seq_span = MAX(0, c->last_base - c->first_base + 1);
|
||
|
|
}
|
||
|
|
s->hdr->num_records = c->curr_rec;
|
||
|
|
|
||
|
|
if (c->curr_slice == 0) {
|
||
|
|
if (c->ref_seq_id != s->hdr->ref_seq_id)
|
||
|
|
c->ref_seq_id = s->hdr->ref_seq_id;
|
||
|
|
c->ref_seq_start = c->first_base;
|
||
|
|
}
|
||
|
|
|
||
|
|
c->curr_slice++;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Handles creation of a new container or new slice, flushing any
|
||
|
|
* existing containers when appropriate.
|
||
|
|
*
|
||
|
|
* Really this is next slice, which may or may not lead to a new container.
|
||
|
|
*
|
||
|
|
* Returns cram_container pointer on success
|
||
|
|
* NULL on failure.
|
||
|
|
*/
|
||
|
|
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
|
||
|
|
cram_container *c = fd->ctr;
|
||
|
|
int i;
|
||
|
|
|
||
|
|
/* First occurrence */
|
||
|
|
if (c->curr_ref == -2)
|
||
|
|
c->curr_ref = bam_ref(b);
|
||
|
|
|
||
|
|
if (c->slice)
|
||
|
|
cram_update_curr_slice(c, fd->version);
|
||
|
|
|
||
|
|
/* Flush container */
|
||
|
|
if (c->curr_slice == c->max_slice ||
|
||
|
|
(bam_ref(b) != c->curr_ref && !c->multi_seq)) {
|
||
|
|
c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
|
||
|
|
hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
|
||
|
|
c->ref_seq_id, c->ref_seq_start,
|
||
|
|
c->ref_seq_start + c->ref_seq_span -1);
|
||
|
|
|
||
|
|
/* Encode slices */
|
||
|
|
if (-1 == cram_flush_container_mt(fd, c))
|
||
|
|
return NULL;
|
||
|
|
if (!fd->pool) {
|
||
|
|
// Move to sep func, as we need cram_flush_container for
|
||
|
|
// the closing phase to flush the partial container.
|
||
|
|
for (i = 0; i < c->max_slice; i++) {
|
||
|
|
cram_free_slice(c->slices[i]);
|
||
|
|
c->slices[i] = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
c->slice = NULL;
|
||
|
|
c->curr_slice = 0;
|
||
|
|
|
||
|
|
/* Easy approach for purposes of freeing stats */
|
||
|
|
cram_free_container(c);
|
||
|
|
}
|
||
|
|
|
||
|
|
c = fd->ctr = cram_new_container(fd->seqs_per_slice,
|
||
|
|
fd->slices_per_container);
|
||
|
|
if (!c)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
c->no_ref = fd->no_ref;
|
||
|
|
c->embed_ref = fd->embed_ref;
|
||
|
|
c->record_counter = fd->record_counter;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
c->curr_ref = bam_ref(b);
|
||
|
|
}
|
||
|
|
|
||
|
|
c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
|
||
|
|
|
||
|
|
/* New slice */
|
||
|
|
c->slice = c->slices[c->curr_slice] =
|
||
|
|
cram_new_slice(MAPPED_SLICE, c->max_rec);
|
||
|
|
if (!c->slice)
|
||
|
|
return NULL;
|
||
|
|
|
||
|
|
if (c->multi_seq) {
|
||
|
|
c->slice->hdr->ref_seq_id = -2;
|
||
|
|
c->slice->hdr->ref_seq_start = 0;
|
||
|
|
c->slice->last_apos = 1;
|
||
|
|
} else {
|
||
|
|
c->slice->hdr->ref_seq_id = bam_ref(b);
|
||
|
|
// wrong for unsorted data, will fix during encoding.
|
||
|
|
c->slice->hdr->ref_seq_start = bam_pos(b)+1;
|
||
|
|
c->slice->last_apos = bam_pos(b)+1;
|
||
|
|
}
|
||
|
|
|
||
|
|
c->curr_rec = 0;
|
||
|
|
c->s_num_bases = 0;
|
||
|
|
c->n_mapped = 0;
|
||
|
|
|
||
|
|
// QO field: 0 implies original orientation, 1 implies sequence orientation
|
||
|
|
// 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
|
||
|
|
// Conversely other data sets it's often better than 1% saving for 0.
|
||
|
|
// Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
|
||
|
|
c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
|
||
|
|
|
||
|
|
return c;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Converts a single bam record into a cram record.
|
||
|
|
* Possibly used within a thread.
|
||
|
|
*
|
||
|
|
* Returns 0 on success;
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
static int process_one_read(cram_fd *fd, cram_container *c,
|
||
|
|
cram_slice *s, cram_record *cr,
|
||
|
|
bam_seq_t *b, int rnum, kstring_t *MD,
|
||
|
|
int embed_ref, int no_ref) {
|
||
|
|
int i, fake_qual = -1, NM = 0;
|
||
|
|
char *cp;
|
||
|
|
char *ref, *seq, *qual;
|
||
|
|
|
||
|
|
// Any places with N in seq and/or reference can lead to ambiguous
|
||
|
|
// interpretation of the SAM NM:i tag. So we store these verbatim
|
||
|
|
// to ensure valid data round-trips the same regardless of who
|
||
|
|
// defines it as valid.
|
||
|
|
// Similarly when alignments go beyond end of the reference.
|
||
|
|
int verbatim_NM = fd->store_nm;
|
||
|
|
int verbatim_MD = fd->store_md;
|
||
|
|
|
||
|
|
// FIXME: multi-ref containers
|
||
|
|
|
||
|
|
cr->flags = bam_flag(b);
|
||
|
|
cr->len = bam_seq_len(b);
|
||
|
|
uint8_t *md;
|
||
|
|
if (!(md = bam_aux_get(b, "MD")))
|
||
|
|
MD = NULL;
|
||
|
|
else
|
||
|
|
MD->l = 0;
|
||
|
|
|
||
|
|
int cf_tag = 0;
|
||
|
|
|
||
|
|
if (embed_ref == 2) {
|
||
|
|
cf_tag = MD ? 0 : 1; // No MD
|
||
|
|
cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
|
||
|
|
}
|
||
|
|
|
||
|
|
//fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
|
||
|
|
|
||
|
|
ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
|
||
|
|
cr->ref_id = bam_ref(b);
|
||
|
|
if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
|
||
|
|
goto block_err;
|
||
|
|
if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
// Non reference based encoding means storing the bases verbatim as features, which in
|
||
|
|
// turn means every base also has a quality already stored.
|
||
|
|
if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
|
||
|
|
cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
|
||
|
|
|
||
|
|
if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
|
||
|
|
cr->cram_flags |= CRAM_FLAG_NO_SEQ;
|
||
|
|
//cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
|
||
|
|
|
||
|
|
c->num_bases += cr->len;
|
||
|
|
cr->apos = bam_pos(b)+1;
|
||
|
|
if (cr->apos < 0 || cr->apos > INT64_MAX/2)
|
||
|
|
goto err;
|
||
|
|
if (c->pos_sorted) {
|
||
|
|
if (cr->apos < s->last_apos && !fd->ap_delta) {
|
||
|
|
c->pos_sorted = 0;
|
||
|
|
} else {
|
||
|
|
if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
|
||
|
|
goto block_err;
|
||
|
|
s->last_apos = cr->apos;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
//cram_stats_add(c->stats[DS_AP], cr->apos);
|
||
|
|
}
|
||
|
|
c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
|
||
|
|
|
||
|
|
/*
|
||
|
|
* This seqs_ds is largely pointless and it could reuse the same memory
|
||
|
|
* over and over.
|
||
|
|
* s->base_blk is what we need for encoding.
|
||
|
|
*/
|
||
|
|
cr->seq = BLOCK_SIZE(s->seqs_blk);
|
||
|
|
cr->qual = BLOCK_SIZE(s->qual_blk);
|
||
|
|
BLOCK_GROW(s->seqs_blk, cr->len+1);
|
||
|
|
BLOCK_GROW(s->qual_blk, cr->len);
|
||
|
|
|
||
|
|
// Convert BAM nibble encoded sequence to string of base pairs
|
||
|
|
seq = cp = (char *)BLOCK_END(s->seqs_blk);
|
||
|
|
*seq = 0;
|
||
|
|
nibble2base(bam_seq(b), cp, cr->len);
|
||
|
|
BLOCK_SIZE(s->seqs_blk) += cr->len;
|
||
|
|
|
||
|
|
qual = cp = (char *)bam_qual(b);
|
||
|
|
|
||
|
|
|
||
|
|
/* Copy and parse */
|
||
|
|
if (!(cr->flags & BAM_FUNMAP)) {
|
||
|
|
uint32_t *cig_to, *cig_from;
|
||
|
|
int64_t apos = cr->apos-1, spos = 0;
|
||
|
|
int64_t MD_last = apos; // last position of edit in MD tag
|
||
|
|
|
||
|
|
if (apos < 0) {
|
||
|
|
hts_log_error("Mapped read with position <= 0 is disallowed");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->cigar = s->ncigar;
|
||
|
|
cr->ncigar = bam_cigar_len(b);
|
||
|
|
while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
|
||
|
|
s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
|
||
|
|
s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
|
||
|
|
if (!s->cigar)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
cig_to = (uint32_t *)s->cigar;
|
||
|
|
cig_from = (uint32_t *)bam_cigar(b);
|
||
|
|
|
||
|
|
cr->feature = 0;
|
||
|
|
cr->nfeature = 0;
|
||
|
|
for (i = 0; i < cr->ncigar; i++) {
|
||
|
|
enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
|
||
|
|
uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
|
||
|
|
cig_to[i] = cig_from[i];
|
||
|
|
|
||
|
|
/* Can also generate events from here for CRAM diffs */
|
||
|
|
|
||
|
|
switch (cig_op) {
|
||
|
|
int l;
|
||
|
|
|
||
|
|
// Don't trust = and X ops to be correct.
|
||
|
|
case BAM_CMATCH:
|
||
|
|
case BAM_CBASE_MATCH:
|
||
|
|
case BAM_CBASE_MISMATCH:
|
||
|
|
//fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
|
||
|
|
// cig_len, &ref[apos], cig_len, &seq[spos]);
|
||
|
|
l = 0;
|
||
|
|
if (!no_ref && cr->len) {
|
||
|
|
int end = cig_len+apos < c->ref_end
|
||
|
|
? cig_len : c->ref_end - apos;
|
||
|
|
char *sp = &seq[spos];
|
||
|
|
char *rp = &ref[apos];
|
||
|
|
char *qp = &qual[spos];
|
||
|
|
if (end > cr->len) {
|
||
|
|
hts_log_error("CIGAR and query sequence are of different length");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
for (l = 0; l < end; l++) {
|
||
|
|
// This case is just too disputed and different tools
|
||
|
|
// interpret these in different ways. We give up and
|
||
|
|
// store verbatim.
|
||
|
|
if (rp[l] == 'N' && sp[l] == 'N')
|
||
|
|
verbatim_NM = verbatim_MD = 1;
|
||
|
|
if (rp[l] != sp[l]) {
|
||
|
|
// Build our own MD tag if one is on the sequence, so
|
||
|
|
// we can ensure it matches and thus can be discarded.
|
||
|
|
if (MD && ref) {
|
||
|
|
if (kputuw(apos+l - MD_last, MD) < 0) goto err;
|
||
|
|
if (kputc(rp[l], MD) < 0) goto err;
|
||
|
|
MD_last = apos+l+1;
|
||
|
|
}
|
||
|
|
NM++;
|
||
|
|
if (!sp[l])
|
||
|
|
break;
|
||
|
|
if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
|
||
|
|
#if 0
|
||
|
|
// Disabled for the time being as it doesn't
|
||
|
|
// seem to gain us much.
|
||
|
|
int ol=l;
|
||
|
|
while (l<end && rp[l] != sp[l])
|
||
|
|
l++;
|
||
|
|
if (l-ol > 1) {
|
||
|
|
if (cram_add_bases(fd, c, s, cr, spos+ol,
|
||
|
|
l-ol, &seq[spos+ol]))
|
||
|
|
return -1;
|
||
|
|
l--;
|
||
|
|
} else {
|
||
|
|
l = ol;
|
||
|
|
if (cram_add_substitution(fd, c, s, cr,
|
||
|
|
spos+l, sp[l],
|
||
|
|
qp[l], rp[l]))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
#else
|
||
|
|
// With urmap pushed to the limit and lots
|
||
|
|
// of unaligned data (should be soft-clipped)
|
||
|
|
// this saves ~2-7%. Worth it?
|
||
|
|
int nl = l;
|
||
|
|
int max_end = nl, max_score = 0, score = 0;
|
||
|
|
while (nl < end) {
|
||
|
|
if (rp[nl] != sp[nl]) {
|
||
|
|
score += 3;
|
||
|
|
if (max_score < score) {
|
||
|
|
max_score = score;
|
||
|
|
max_end = nl;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
score--;
|
||
|
|
if (score < -2 ||
|
||
|
|
max_score - score > 7)
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
nl++;
|
||
|
|
}
|
||
|
|
if (max_score > 20) {
|
||
|
|
cram_add_bases(fd, c, s, cr, spos+l,
|
||
|
|
max_end-l, &seq[spos+l]);
|
||
|
|
l = max_end-1;
|
||
|
|
} else {
|
||
|
|
while (l < nl) {
|
||
|
|
if (rp[l] != sp[l])
|
||
|
|
cram_add_substitution(fd, c, s,
|
||
|
|
cr, spos+l,
|
||
|
|
sp[l], qp[l],
|
||
|
|
rp[l]);
|
||
|
|
l++;
|
||
|
|
}
|
||
|
|
l--;
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
} else {
|
||
|
|
if (cram_add_substitution(fd, c, s, cr, spos+l,
|
||
|
|
sp[l], qp[l], rp[l]))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
spos += l;
|
||
|
|
apos += l;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (l < cig_len && cr->len) {
|
||
|
|
if (no_ref) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 3) {
|
||
|
|
if (cram_add_bases(fd, c, s, cr, spos,
|
||
|
|
cig_len-l, &seq[spos]))
|
||
|
|
return -1;
|
||
|
|
spos += cig_len-l;
|
||
|
|
} else {
|
||
|
|
for (; l < cig_len && seq[spos]; l++, spos++) {
|
||
|
|
if (cram_add_base(fd, c, s, cr, spos,
|
||
|
|
seq[spos], qual[spos]))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
/* off end of sequence or non-ref based output */
|
||
|
|
verbatim_NM = verbatim_MD = 1;
|
||
|
|
for (; l < cig_len && seq[spos]; l++, spos++) {
|
||
|
|
if (cram_add_base(fd, c, s, cr, spos,
|
||
|
|
seq[spos], qual[spos]))
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
apos += cig_len;
|
||
|
|
} else if (!cr->len) {
|
||
|
|
/* Seq "*" */
|
||
|
|
verbatim_NM = verbatim_MD = 1;
|
||
|
|
apos += cig_len;
|
||
|
|
spos += cig_len;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BAM_CDEL:
|
||
|
|
if (MD && ref) {
|
||
|
|
if (kputuw(apos - MD_last, MD) < 0) goto err;
|
||
|
|
if (apos < c->ref_end) {
|
||
|
|
if (kputc_('^', MD) < 0) goto err;
|
||
|
|
if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
|
||
|
|
goto err;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
NM += cig_len;
|
||
|
|
|
||
|
|
if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
|
||
|
|
return -1;
|
||
|
|
apos += cig_len;
|
||
|
|
MD_last = apos;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BAM_CREF_SKIP:
|
||
|
|
if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
|
||
|
|
return -1;
|
||
|
|
apos += cig_len;
|
||
|
|
MD_last += cig_len;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BAM_CINS:
|
||
|
|
if (cram_add_insertion(c, s, cr, spos, cig_len,
|
||
|
|
cr->len ? &seq[spos] : NULL))
|
||
|
|
return -1;
|
||
|
|
if (no_ref && cr->len) {
|
||
|
|
for (l = 0; l < cig_len; l++, spos++) {
|
||
|
|
cram_add_quality(fd, c, s, cr, spos, qual[spos]);
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
spos += cig_len;
|
||
|
|
}
|
||
|
|
NM += cig_len;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BAM_CSOFT_CLIP:
|
||
|
|
if (cram_add_softclip(c, s, cr, spos, cig_len,
|
||
|
|
cr->len ? &seq[spos] : NULL,
|
||
|
|
fd->version))
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
if (no_ref &&
|
||
|
|
!(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
|
||
|
|
if (cr->len) {
|
||
|
|
for (l = 0; l < cig_len; l++, spos++) {
|
||
|
|
cram_add_quality(fd, c, s, cr, spos, qual[spos]);
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
for (l = 0; l < cig_len; l++, spos++) {
|
||
|
|
cram_add_quality(fd, c, s, cr, spos, -1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
spos += cig_len;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BAM_CHARD_CLIP:
|
||
|
|
if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
|
||
|
|
return -1;
|
||
|
|
break;
|
||
|
|
|
||
|
|
case BAM_CPAD:
|
||
|
|
if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
|
||
|
|
return -1;
|
||
|
|
break;
|
||
|
|
|
||
|
|
default:
|
||
|
|
hts_log_error("Unknown CIGAR op code %d", cig_op);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (cr->len && spos != cr->len) {
|
||
|
|
hts_log_error("CIGAR and query sequence are of different length");
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
fake_qual = spos;
|
||
|
|
cr->aend = no_ref ? apos : MIN(apos, c->ref_end);
|
||
|
|
if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
if (MD && ref)
|
||
|
|
if (kputuw(apos - MD_last, MD) < 0) goto err;
|
||
|
|
} else {
|
||
|
|
// Unmapped
|
||
|
|
cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
|
||
|
|
cr->cigar = 0;
|
||
|
|
cr->ncigar = 0;
|
||
|
|
cr->nfeature = 0;
|
||
|
|
cr->aend = MIN(cr->apos, c->ref_end);
|
||
|
|
for (i = 0; i < cr->len; i++)
|
||
|
|
if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
|
||
|
|
goto block_err;
|
||
|
|
fake_qual = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
|
||
|
|
int err = 0;
|
||
|
|
sam_hrec_rg_t *brg =
|
||
|
|
cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
|
||
|
|
cf_tag, no_ref, &err);
|
||
|
|
if (err)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
/* Read group, identified earlier */
|
||
|
|
if (brg) {
|
||
|
|
cr->rg = brg->id;
|
||
|
|
} else if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
||
|
|
sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
|
||
|
|
if (!brg) goto block_err;
|
||
|
|
cr->rg = brg->id;
|
||
|
|
} else {
|
||
|
|
cr->rg = -1;
|
||
|
|
}
|
||
|
|
if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Append to the qual block now. We do this here as
|
||
|
|
* cram_add_substitution() can generate BA/QS events which need to
|
||
|
|
* be in the qual block before we append the rest of the data.
|
||
|
|
*/
|
||
|
|
if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
|
||
|
|
/* Special case of seq "*" */
|
||
|
|
if (cr->len == 0) {
|
||
|
|
cr->len = fake_qual;
|
||
|
|
BLOCK_GROW(s->qual_blk, cr->len);
|
||
|
|
cp = (char *)BLOCK_END(s->qual_blk);
|
||
|
|
memset(cp, 255, cr->len);
|
||
|
|
} else {
|
||
|
|
BLOCK_GROW(s->qual_blk, cr->len);
|
||
|
|
cp = (char *)BLOCK_END(s->qual_blk);
|
||
|
|
char *from = (char *)&bam_qual(b)[0];
|
||
|
|
char *to = &cp[0];
|
||
|
|
memcpy(to, from, cr->len);
|
||
|
|
|
||
|
|
// Store quality in original orientation for better compression.
|
||
|
|
if (!c->qs_seq_orient) {
|
||
|
|
if (cr->flags & BAM_FREVERSE) {
|
||
|
|
int i, j;
|
||
|
|
for (i = 0, j = cr->len-1; i < j; i++, j--) {
|
||
|
|
unsigned char c;
|
||
|
|
c = to[i];
|
||
|
|
to[i] = to[j];
|
||
|
|
to[j] = c;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
BLOCK_SIZE(s->qual_blk) += cr->len;
|
||
|
|
} else {
|
||
|
|
if (cr->len == 0)
|
||
|
|
cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
/* Now we know apos and aend both, update mate-pair information */
|
||
|
|
{
|
||
|
|
int new;
|
||
|
|
khint_t k;
|
||
|
|
int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
|
||
|
|
|
||
|
|
//fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
|
||
|
|
// cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
|
||
|
|
if (cr->flags & BAM_FPAIRED) {
|
||
|
|
char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
|
||
|
|
if (!key)
|
||
|
|
return -1;
|
||
|
|
|
||
|
|
k = kh_put(m_s2i, s->pair[sec], key, &new);
|
||
|
|
if (-1 == new)
|
||
|
|
return -1;
|
||
|
|
else if (new > 0)
|
||
|
|
kh_val(s->pair[sec], k) = rnum;
|
||
|
|
} else {
|
||
|
|
new = 1;
|
||
|
|
k = 0; // Prevents false-positive warning from gcc -Og
|
||
|
|
}
|
||
|
|
|
||
|
|
if (new == 0) {
|
||
|
|
cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
|
||
|
|
int64_t aleft, aright;
|
||
|
|
int sign;
|
||
|
|
|
||
|
|
aleft = MIN(cr->apos, p->apos);
|
||
|
|
aright = MAX(cr->aend, p->aend);
|
||
|
|
if (cr->apos < p->apos) {
|
||
|
|
sign = 1;
|
||
|
|
} else if (cr->apos > p->apos) {
|
||
|
|
sign = -1;
|
||
|
|
} else if (cr->flags & BAM_FREAD1) {
|
||
|
|
sign = 1;
|
||
|
|
} else {
|
||
|
|
sign = -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
|
||
|
|
// a small amount, if appropriate options set.
|
||
|
|
if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
|
||
|
|
!(fd->tlen_zero && bam_mate_pos(b) == 0))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
|
||
|
|
((p->flags & BAM_FUNMAP) != 0))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
|
||
|
|
((p->flags & BAM_FREVERSE) != 0))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
|
||
|
|
// p vs this: tlen, matepos, flags
|
||
|
|
if (p->ref_id != cr->ref_id &&
|
||
|
|
!(fd->tlen_zero && p->ref_id == -1))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
if (p->mate_pos != cr->apos &&
|
||
|
|
!(fd->tlen_zero && p->mate_pos == 0))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
if (((p->flags & BAM_FMUNMAP) != 0) !=
|
||
|
|
((p->mate_flags & CRAM_M_UNMAP) != 0))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
if (((p->flags & BAM_FMREVERSE) != 0) !=
|
||
|
|
((p->mate_flags & CRAM_M_REVERSE) != 0))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
// Supplementary reads are just too ill defined
|
||
|
|
if ((cr->flags & BAM_FSUPPLEMENTARY) ||
|
||
|
|
(p->flags & BAM_FSUPPLEMENTARY))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
// When in lossy name mode, if a read isn't detached we
|
||
|
|
// cannot store the name. The corollary is that when we
|
||
|
|
// must store the name, it must be detached (inefficient).
|
||
|
|
if (fd->lossy_read_names &&
|
||
|
|
(!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
|
||
|
|
!((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
|
||
|
|
goto detached;
|
||
|
|
|
||
|
|
// Now check TLEN. We do this last as sometimes it's the
|
||
|
|
// only thing that differs. In CRAM4 we have a better way
|
||
|
|
// of handling this that doesn't break detached status
|
||
|
|
int explicit_tlen = 0;
|
||
|
|
int tflag1 = ((bam_ins_size(b) &&
|
||
|
|
llabs(bam_ins_size(b) - sign*(aright-aleft+1))
|
||
|
|
> fd->tlen_approx)
|
||
|
|
|| (!bam_ins_size(b) && !fd->tlen_zero));
|
||
|
|
|
||
|
|
int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
|
||
|
|
> fd->tlen_approx)
|
||
|
|
|| (!p->tlen && !fd->tlen_zero));
|
||
|
|
|
||
|
|
if (tflag1 || tflag2) {
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
|
||
|
|
explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
|
||
|
|
} else {
|
||
|
|
// Stil do detached for unmapped data in CRAM4 as this
|
||
|
|
// also impacts RNEXT calculation.
|
||
|
|
goto detached;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* The fields below are unused when encoding this read as it is
|
||
|
|
* no longer detached. In theory they may get referred to when
|
||
|
|
* processing a 3rd or 4th read in this template?, so we set them
|
||
|
|
* here just to be sure.
|
||
|
|
*
|
||
|
|
* They do not need cram_stats_add() calls those as they are
|
||
|
|
* not emitted.
|
||
|
|
*/
|
||
|
|
cr->mate_pos = p->apos;
|
||
|
|
cram_stats_add(c->stats[DS_NP], cr->mate_pos);
|
||
|
|
cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
|
||
|
|
cram_stats_add(c->stats[DS_TS], cr->tlen);
|
||
|
|
cr->mate_flags =
|
||
|
|
((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
|
||
|
|
((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
|
||
|
|
|
||
|
|
// Decrement statistics aggregated earlier
|
||
|
|
if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
|
||
|
|
cram_stats_del(c->stats[DS_NP], p->mate_pos);
|
||
|
|
cram_stats_del(c->stats[DS_MF], p->mate_flags);
|
||
|
|
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
|
||
|
|
cram_stats_del(c->stats[DS_TS], p->tlen);
|
||
|
|
cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Similarly we could correct the p-> values too, but these will no
|
||
|
|
* longer have any code that refers back to them as the new 'p'
|
||
|
|
* for this template is our current 'cr'.
|
||
|
|
*/
|
||
|
|
//p->mate_pos = cr->apos;
|
||
|
|
//p->mate_flags =
|
||
|
|
// ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
|
||
|
|
// ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
|
||
|
|
//p->tlen = p->apos - cr->aend;
|
||
|
|
|
||
|
|
// Clear detached from cr flags
|
||
|
|
cr->cram_flags &= ~CRAM_FLAG_DETACHED;
|
||
|
|
cr->cram_flags |= explicit_tlen;
|
||
|
|
if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
// Clear detached from p flags and set downstream
|
||
|
|
if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
|
||
|
|
cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
|
||
|
|
p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
|
||
|
|
}
|
||
|
|
|
||
|
|
p->cram_flags &= ~CRAM_FLAG_DETACHED;
|
||
|
|
p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
|
||
|
|
if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
|
||
|
|
if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
kh_val(s->pair[sec], k) = rnum;
|
||
|
|
} else {
|
||
|
|
detached:
|
||
|
|
//fprintf(stderr, "unpaired\n");
|
||
|
|
|
||
|
|
/* Derive mate flags from this flag */
|
||
|
|
cr->mate_flags = 0;
|
||
|
|
if (bam_flag(b) & BAM_FMUNMAP)
|
||
|
|
cr->mate_flags |= CRAM_M_UNMAP;
|
||
|
|
if (bam_flag(b) & BAM_FMREVERSE)
|
||
|
|
cr->mate_flags |= CRAM_M_REVERSE;
|
||
|
|
|
||
|
|
if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
|
||
|
|
if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
cr->tlen = bam_ins_size(b);
|
||
|
|
if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
cr->cram_flags |= CRAM_FLAG_DETACHED;
|
||
|
|
if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
|
||
|
|
goto block_err;
|
||
|
|
if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
cr->mqual = bam_map_qual(b);
|
||
|
|
if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
|
||
|
|
goto block_err;
|
||
|
|
|
||
|
|
cr->mate_ref_id = bam_mate_ref(b);
|
||
|
|
|
||
|
|
if (!(bam_flag(b) & BAM_FUNMAP)) {
|
||
|
|
if (c->first_base > cr->apos)
|
||
|
|
c->first_base = cr->apos;
|
||
|
|
|
||
|
|
if (c->last_base < cr->aend)
|
||
|
|
c->last_base = cr->aend;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
block_err:
|
||
|
|
err:
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Write iterator: put BAM format sequences into a CRAM file.
|
||
|
|
* We buffer up a containers worth of data at a time.
|
||
|
|
*
|
||
|
|
* Returns 0 on success
|
||
|
|
* -1 on failure
|
||
|
|
*/
|
||
|
|
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
|
||
|
|
cram_container *c;
|
||
|
|
|
||
|
|
if (!fd->ctr) {
|
||
|
|
fd->ctr = cram_new_container(fd->seqs_per_slice,
|
||
|
|
fd->slices_per_container);
|
||
|
|
if (!fd->ctr)
|
||
|
|
return -1;
|
||
|
|
fd->ctr->record_counter = fd->record_counter;
|
||
|
|
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
fd->ctr->no_ref = fd->no_ref;
|
||
|
|
fd->ctr->embed_ref = fd->embed_ref;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
c = fd->ctr;
|
||
|
|
|
||
|
|
int embed_ref = c->embed_ref;
|
||
|
|
|
||
|
|
if (!c->slice || c->curr_rec == c->max_rec ||
|
||
|
|
(bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
|
||
|
|
(c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) {
|
||
|
|
int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
|
||
|
|
int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Start packing slices when we routinely have under 1/4tr full.
|
||
|
|
*
|
||
|
|
* This option isn't available if we choose to embed references
|
||
|
|
* since we can only have one per slice.
|
||
|
|
*
|
||
|
|
* The multi_seq var here refers to our intention for the next slice.
|
||
|
|
* This slice has already been encoded so we output as-is.
|
||
|
|
*/
|
||
|
|
if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
|
||
|
|
fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
|
||
|
|
embed_ref<=0) {
|
||
|
|
if (!c->multi_seq)
|
||
|
|
hts_log_info("Multi-ref enabled for next container");
|
||
|
|
multi_seq = 1;
|
||
|
|
} else if (fd->multi_seq == 1) {
|
||
|
|
pthread_mutex_lock(&fd->metrics_lock);
|
||
|
|
if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
|
||
|
|
multi_seq = 0;
|
||
|
|
hts_log_info("Multi-ref disabled for next container");
|
||
|
|
}
|
||
|
|
pthread_mutex_unlock(&fd->metrics_lock);
|
||
|
|
}
|
||
|
|
|
||
|
|
slice_rec = c->slice_rec;
|
||
|
|
curr_rec = c->curr_rec;
|
||
|
|
|
||
|
|
if (CRAM_MAJOR_VERS(fd->version) == 1 ||
|
||
|
|
c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
|
||
|
|
c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) {
|
||
|
|
if (NULL == (c = cram_next_container(fd, b))) {
|
||
|
|
if (fd->ctr) {
|
||
|
|
// prevent cram_close attempting to flush
|
||
|
|
fd->ctr_mt = fd->ctr; // delay free when threading
|
||
|
|
fd->ctr = NULL;
|
||
|
|
}
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Due to our processing order, some things we've already done we
|
||
|
|
* cannot easily undo. So when we first notice we should be packing
|
||
|
|
* multiple sequences per container we emit the small partial
|
||
|
|
* container as-is and then start a fresh one in a different mode.
|
||
|
|
*/
|
||
|
|
if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
|
||
|
|
// User selected auto-mode, we're currently using multi-seq, but
|
||
|
|
// have detected we don't need to. Switch back to auto.
|
||
|
|
fd->multi_seq = -1;
|
||
|
|
} else if (multi_seq) {
|
||
|
|
// We detected we need multi-seq
|
||
|
|
fd->multi_seq = 1;
|
||
|
|
c->multi_seq = 1;
|
||
|
|
c->pos_sorted = 0;
|
||
|
|
|
||
|
|
// Cram_next_container may end up flushing an existing one and
|
||
|
|
// triggering fd->embed_ref=2 if no reference is found.
|
||
|
|
// Embedded refs are incompatible with multi-seq, so we bail
|
||
|
|
// out and switch to no_ref in this scenario. We do this
|
||
|
|
// within the container only, as multi_seq may be temporary
|
||
|
|
// and we switch back away from it again.
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
|
||
|
|
hts_log_warning("Changing from embed_ref to no_ref mode");
|
||
|
|
// Should we update fd->embed_ref and no_ref here too?
|
||
|
|
// Doing so means if we go into multi-seq and back out
|
||
|
|
// again, eg due a cluster of tiny refs in the middle of
|
||
|
|
// much larger ones, then we bake in no-ref mode.
|
||
|
|
//
|
||
|
|
// However for unsorted data we're realistically not
|
||
|
|
// going to switch back.
|
||
|
|
c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
|
||
|
|
c->no_ref = fd->no_ref = 1;
|
||
|
|
}
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
|
||
|
|
if (!c->refs_used) {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
c->refs_used = calloc(fd->refs->nref, sizeof(int));
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
if (!c->refs_used)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
fd->last_slice = curr_rec - slice_rec;
|
||
|
|
c->slice_rec = c->curr_rec;
|
||
|
|
|
||
|
|
// Have we seen this reference before?
|
||
|
|
if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
|
||
|
|
embed_ref<=0 && !fd->unsorted && multi_seq) {
|
||
|
|
|
||
|
|
if (!c->refs_used) {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
c->refs_used = calloc(fd->refs->nref, sizeof(int));
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
if (!c->refs_used)
|
||
|
|
return -1;
|
||
|
|
} else if (c->refs_used && c->refs_used[bam_ref(b)]) {
|
||
|
|
pthread_mutex_lock(&fd->ref_lock);
|
||
|
|
fd->unsorted = 1;
|
||
|
|
fd->multi_seq = 1;
|
||
|
|
pthread_mutex_unlock(&fd->ref_lock);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
c->curr_ref = bam_ref(b);
|
||
|
|
if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!c->bams) {
|
||
|
|
/* First time through, allocate a set of bam pointers */
|
||
|
|
pthread_mutex_lock(&fd->bam_list_lock);
|
||
|
|
if (fd->bl) {
|
||
|
|
spare_bams *spare = fd->bl;
|
||
|
|
c->bams = spare->bams;
|
||
|
|
fd->bl = spare->next;
|
||
|
|
free(spare);
|
||
|
|
} else {
|
||
|
|
c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
|
||
|
|
if (!c->bams) {
|
||
|
|
pthread_mutex_unlock(&fd->bam_list_lock);
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
pthread_mutex_unlock(&fd->bam_list_lock);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Copy or alloc+copy the bam record, for later encoding */
|
||
|
|
if (c->bams[c->curr_c_rec]) {
|
||
|
|
if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
|
||
|
|
return -1;
|
||
|
|
} else {
|
||
|
|
c->bams[c->curr_c_rec] = bam_dup1(b);
|
||
|
|
if (c->bams[c->curr_c_rec] == NULL)
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
if (bam_seq_len(b)) {
|
||
|
|
c->s_num_bases += bam_seq_len(b);
|
||
|
|
} else {
|
||
|
|
// No sequence in BAM record. CRAM doesn't directly support this
|
||
|
|
// case, it ends up being stored as a string of N's for each query
|
||
|
|
// consuming CIGAR operation. As this can become very inefficient
|
||
|
|
// in time and memory, data where the query length is excessively
|
||
|
|
// long are rejected.
|
||
|
|
hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
|
||
|
|
if (qlen > 100000000) {
|
||
|
|
hts_log_error("CIGAR query length %"PRIhts_pos
|
||
|
|
" for read \"%s\" is too long",
|
||
|
|
qlen, bam_get_qname(b));
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
c->s_num_bases += qlen;
|
||
|
|
}
|
||
|
|
c->curr_rec++;
|
||
|
|
c->curr_c_rec++;
|
||
|
|
c->s_aux_bytes += bam_get_l_aux(b);
|
||
|
|
c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
|
||
|
|
fd->record_counter++;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|