/*
Copyright (c) 2013-2019 Genome Research Ltd.
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*! \file
 * SAM header parsing.
 *
 * These functions can be shared between SAM, BAM and CRAM file
 * formats as all three internally use the same string encoding for
 * header fields.
 */


#ifndef HEADER_H_
#define HEADER_H_

#include <stdarg.h>

#include "cram/string_alloc.h"
#include "cram/pooled_alloc.h"

#include "htslib/khash.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"

#ifdef __cplusplus
extern "C" {
#endif

/*! Make a single integer out of a two-letter type code
 *
 * @param type  Pointer to the type code, e.g. "SQ".  Exactly two bytes,
 *              type[0] and type[1], are read; no terminator is required.
 *
 * @return (type[0] << 8) | type[1], each byte taken as an unsigned char
 */
static inline khint32_t TYPEKEY(const char *type) {
    // Convert via unsigned char so that bytes >= 0x80 are not sign-extended
    // into the upper bits of the key.
    unsigned int u0 = (unsigned char) type[0];
    unsigned int u1 = (unsigned char) type[1];
    return (u0 << 8) | u1;
}

/*
 * Proposed new SAM header parsing

1 @SQ ID:foo LN:100
2 @SQ ID:bar LN:200
3 @SQ ID:ram LN:300 UR:xyz
4 @RG ID:r ...
5 @RG ID:s ...

Hash table for 2-char @keys without dup entries.
If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.

HASH("SQ")--\
            |
    (3) <-> 1 <-> 2 <-> 3 <-> (1)

HASH("RG")--\
            |
    (5) <-> 4 <-> 5 <-> (4)

Items stored in the hash values also form their own linked lists:
Ie SQ->ID(foo)->LN(100)
   SQ->ID(bar)->LN(200)
   SQ->ID(ram)->LN(300)->UR(xyz)
   RG->ID(r)
 */

/*! A single key:value pair on a header line
 *
 * These form a linked list and hold strings.  The strings are
 * allocated from a string_alloc_t pool referenced in the master
 * sam_hrecs_t structure.  Do not attempt to free, malloc or manipulate
 * these strings directly.
 */
typedef struct sam_hrec_tag_s {
    struct sam_hrec_tag_s *next; //!< Next tag in the list for this line
    const char *str;             //!< Tag text; storage belongs to the string pool
    int len;                     //!< Length that goes with str (presumably in bytes - confirm)
} sam_hrec_tag_t;

/*! The parsed version of the SAM header string.
|
|
*
|
|
* Each header type (SQ, RG, HD, etc) points to its own sam_hdr_type
|
|
* struct via the main hash table h in the sam_hrecs_t struct.
|
|
*
|
|
* These in turn consist of circular bi-directional linked lists (ie
|
|
* rings) to hold the multiple instances of the same header type
|
|
* code. For example if we have 5 \@SQ lines the primary hash table
|
|
* will key on \@SQ pointing to the first sam_hdr_type and that in turn
|
|
* will be part of a ring of 5 elements.
|
|
*
|
|
* For each sam_hdr_type structure we also point to a sam_hdr_tag
|
|
* structure which holds the tokenised attributes; the tab separated
|
|
* key:value pairs per line.
|
|
*/
|
|
typedef struct sam_hrec_type_s {
|
|
struct sam_hrec_type_s *next; // circular list of this type
|
|
struct sam_hrec_type_s *prev; // circular list of this type
|
|
struct sam_hrec_type_s *global_next; // circular list of all lines
|
|
struct sam_hrec_type_s *global_prev; // circular list of all lines
|
|
sam_hrec_tag_t *tag; // first tag
|
|
khint32_t type; // Two-letter type code as an int
|
|
} sam_hrec_type_t;
|
|
|
|
/*! Parsed \@SQ lines */
|
|
typedef struct {
|
|
const char *name;
|
|
hts_pos_t len;
|
|
sam_hrec_type_t *ty;
|
|
} sam_hrec_sq_t;
|
|
|
|
/*! Parsed \@RG lines */
|
|
typedef struct {
|
|
const char *name;
|
|
sam_hrec_type_t *ty;
|
|
int name_len;
|
|
int id; // numerical ID
|
|
} sam_hrec_rg_t;
|
|
|
|
/*! Parsed \@PG lines */
|
|
typedef struct {
|
|
const char *name;
|
|
sam_hrec_type_t *ty;
|
|
int name_len;
|
|
int id; // numerical ID
|
|
int prev_id; // -1 if none
|
|
} sam_hrec_pg_t;
|
|
|
|
|
|
/*! Sort order parsed from \@HD line */
enum sam_sort_order {
    ORDER_UNKNOWN  =-1,
    ORDER_UNSORTED = 0,
    ORDER_NAME     = 1,
    ORDER_COORD    = 2
    //ORDER_COLLATE = 3 // maybe one day!
};

/*! Group order parsed from \@HD line
 *
 * NOTE(review): the names match the none / query / reference values the SAM
 * specification defines for the \@HD GO: field - confirm against the parser.
 */
enum sam_group_order {
    ORDER_NONE      =-1,
    ORDER_QUERY     = 0,
    ORDER_REFERENCE = 1
};

// Hash map type "sam_hrecs_t": integer key -> sam_hrec_type_t pointer.
// Used for the h member of struct sam_hrecs_t.
KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*)
// Hash map type "m_s2i": string key -> int.
// Used for the ref_hash, rg_hash and pg_hash members of struct sam_hrecs_t.
KHASH_MAP_INIT_STR(m_s2i, int)

/*! Primary structure for header manipulation
|
|
*
|
|
* The initial header text is held in the text kstring_t, but is also
|
|
* parsed out into SQ, RG and PG arrays. These have a hash table
|
|
* associated with each to allow lookup by ID or SN fields instead of
|
|
* their numeric array indices. Additionally PG has an array to hold
|
|
* the linked list start points (the last in a PP chain).
|
|
*
|
|
* Use the appropriate sam_hdr_* functions to edit the header, and
|
|
* call sam_hdr_rebuild() any time the textual form needs to be
|
|
* updated again.
|
|
*/
|
|
struct sam_hrecs_t {
|
|
khash_t(sam_hrecs_t) *h;
|
|
sam_hrec_type_t *first_line; //!< First line (usually @HD)
|
|
string_alloc_t *str_pool; //!< Pool of sam_hdr_tag->str strings
|
|
pool_alloc_t *type_pool;//!< Pool of sam_hdr_type structs
|
|
pool_alloc_t *tag_pool; //!< Pool of sam_hdr_tag structs
|
|
|
|
// @SQ lines / references
|
|
int nref; //!< Number of \@SQ lines
|
|
int ref_sz; //!< Number of entries available in ref[]
|
|
sam_hrec_sq_t *ref; //!< Array of parsed \@SQ lines
|
|
khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index
|
|
|
|
// @RG lines / read-groups
|
|
int nrg; //!< Number of \@RG lines
|
|
int rg_sz; //!< number of entries available in rg[]
|
|
sam_hrec_rg_t *rg; //!< Array of parsed \@RG lines
|
|
khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index
|
|
|
|
// @PG lines / programs
|
|
int npg; //!< Number of \@PG lines
|
|
int pg_sz; //!< Number of entries available in pg[]
|
|
int npg_end; //!< Number of terminating \@PG lines
|
|
int npg_end_alloc; //!< Size of pg_end field
|
|
sam_hrec_pg_t *pg; //!< Array of parsed \@PG lines
|
|
khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index
|
|
int *pg_end; //!< \@PG chain termination IDs
|
|
|
|
// @cond internal
|
|
char *ID_buf; // temporary buffer for sam_hdr_pg_id
|
|
uint32_t ID_buf_sz;
|
|
int ID_cnt;
|
|
// @endcond
|
|
|
|
int dirty; // marks the header as modified, so it can be rebuilt
|
|
int refs_changed; // Index of first changed ref (-1 if unchanged)
|
|
int pgs_changed; // New PG line added
|
|
int type_count;
|
|
char (*type_order)[3];
|
|
};
|
|
|
|
/*!
|
|
* Method for parsing the header text and populating the
|
|
* internal hash tables. After calling this method, the
|
|
* parsed representation becomes the single source of truth.
|
|
*
|
|
* @param bh Header structure, previously initialised by a
|
|
* sam_hdr_init call
|
|
* @return 0 on success, -1 on failure
|
|
*/
|
|
int sam_hdr_fill_hrecs(sam_hdr_t *bh);
|
|
|
|
/*!
|
|
* Reconstructs the text representation of the header from
|
|
* the hash table data after a change has been performed on
|
|
* the header.
|
|
*
|
|
* @return 0 on success, -1 on failure
|
|
*/
|
|
int sam_hdr_rebuild(sam_hdr_t *bh);
|
|
|
|
/*! Creates an empty SAM header, ready to be populated.
|
|
*
|
|
* @return
|
|
* Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
|
|
* NULL on failure
|
|
*/
|
|
sam_hrecs_t *sam_hrecs_new(void);
|
|
|
|
/*! Produces a duplicate copy of hrecs and returns it.
|
|
* @return
|
|
* Returns NULL on failure
|
|
*/
|
|
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs);
|
|
|
|
/*! Update sam_hdr_t target_name and target_len arrays
|
|
*
|
|
* sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup
|
|
* can use it to construct target arrays from the source header.
|
|
*
|
|
* @return 0 on success; -1 on failure
|
|
*/
|
|
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
|
|
int refs_changed);
|
|
|
|
/*! Reconstructs a kstring from the header hash table.
|
|
*
|
|
* @return
|
|
* Returns 0 on success
|
|
* -1 on failure
|
|
*/
|
|
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks);
|
|
|
|
/*! Deallocates all storage used by a sam_hrecs_t struct.
|
|
*
|
|
* This also decrements the header reference count. If after decrementing
|
|
* it is still non-zero then the header is assumed to be in use by another
|
|
* caller and the free is not done.
|
|
*/
|
|
void sam_hrecs_free(sam_hrecs_t *hrecs);
|
|
|
|
/*!
|
|
* @return
|
|
* Returns the first header item matching 'type'. If ID is non-NULL it checks
|
|
* for the tag ID: and compares against the specified ID.
|
|
*
|
|
* Returns NULL if no type/ID is found
|
|
*/
|
|
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
|
|
const char *ID_key, const char *ID_value);
|
|
|
|
/*! Looks for a tag with the given key on one header line.
 *
 * NOTE(review): this declaration had no documentation; the description
 * below is inferred from the signature only - confirm against the
 * implementation.
 *
 * @param type  Header line whose tag list is searched
 * @param key   Tag key to look for
 * @param prev  Presumably receives the tag preceding the match
 *
 * @return Presumably the matching tag, or NULL if there is none
 */
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
                                   const char *key,
                                   sam_hrec_tag_t **prev);

int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
|
|
sam_hrec_type_t *type,
|
|
const char *key);
|
|
|
|
/*! Looks up a read-group by name and returns a pointer to the start of the
|
|
* associated tag list.
|
|
*
|
|
* @return
|
|
* Returns NULL on failure
|
|
*/
|
|
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg);
|
|
|
|
/*! Returns the sort order from the @HD SO: field */
|
|
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs);
|
|
|
|
/*! Returns the group order from the @HD SO: field */
|
|
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs);
|
|
|
|
#ifdef __cplusplus
}
#endif

#endif /* HEADER_H_ */