191 lines
5.4 KiB
C
191 lines
5.4 KiB
C
/* vcf_sweep.c -- forward/reverse sweep API.
|
|
|
|
Copyright (C) 2013-2014, 2019 Genome Research Ltd.
|
|
|
|
Author: Petr Danecek <pd3@sanger.ac.uk>
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
DEALINGS IN THE SOFTWARE. */
|
|
|
|
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
|
|
#include <config.h>
|
|
|
|
#include <assert.h>
|
|
|
|
#include "htslib/vcf_sweep.h"
|
|
#include "htslib/bgzf.h"
|
|
|
|
#define SW_FWD 0
|
|
#define SW_BWD 1
|
|
|
|
struct bcf_sweep_t
|
|
{
|
|
htsFile *file;
|
|
bcf_hdr_t *hdr;
|
|
BGZF *fp;
|
|
|
|
int direction; // to tell if the direction has changed
|
|
int block_size; // the size of uncompressed data to hold in memory
|
|
bcf1_t *rec; // bcf buffer
|
|
int nrec, mrec; // number of used records; total size of the buffer
|
|
int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record
|
|
char *lals;
|
|
|
|
uint64_t *idx; // uncompressed offsets of VCF/BCF records
|
|
int iidx, nidx, midx; // i: current offset; n: used; m: allocated
|
|
int idx_done; // the index is built during the first pass
|
|
};
|
|
|
|
BGZF *hts_get_bgzfp(htsFile *fp);
|
|
int hts_useek(htsFile *file, off_t uoffset, int where);
|
|
off_t hts_utell(htsFile *file);
|
|
|
|
static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec)
|
|
{
|
|
if ( sw->lrid!=rec->rid ) return 0;
|
|
if ( sw->lpos!=rec->pos ) return 0;
|
|
if ( sw->lnals!=rec->n_allele ) return 0;
|
|
|
|
char *t = rec->d.allele[sw->lnals-1];
|
|
int len = t - rec->d.allele[0] + 1;
|
|
while ( *t ) { t++; len++; }
|
|
if ( sw->lals_len!=len ) return 0;
|
|
if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0;
|
|
return 1;
|
|
}
|
|
|
|
static int sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec)
|
|
{
|
|
sw->lrid = rec->rid;
|
|
sw->lpos = rec->pos;
|
|
sw->lnals = rec->n_allele;
|
|
|
|
char *t = rec->d.allele[sw->lnals-1];
|
|
int len = t - rec->d.allele[0] + 1;
|
|
while ( *t ) { t++; len++; }
|
|
sw->lals_len = len;
|
|
hts_expand(char, len, sw->mlals, sw->lals);
|
|
memcpy(sw->lals, rec->d.allele[0], len);
|
|
|
|
return 0; // FIXME: check for errs in this function
|
|
}
|
|
|
|
static int sw_fill_buffer(bcf_sweep_t *sw)
|
|
{
|
|
if ( !sw->iidx ) return 0;
|
|
sw->iidx--;
|
|
|
|
int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
|
|
assert( ret==0 );
|
|
|
|
sw->nrec = 0;
|
|
bcf1_t *rec = &sw->rec[sw->nrec];
|
|
while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
|
|
{
|
|
bcf_unpack(rec, BCF_UN_STR);
|
|
|
|
// if not in the last block, stop at the saved record
|
|
if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;
|
|
|
|
sw->nrec++;
|
|
hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
|
|
rec = &sw->rec[sw->nrec];
|
|
}
|
|
sw_rec_save(sw, &sw->rec[0]);
|
|
|
|
return 0; // FIXME: check for errs in this function
|
|
}
|
|
|
|
bcf_sweep_t *bcf_sweep_init(const char *fname)
|
|
{
|
|
bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t));
|
|
sw->file = hts_open(fname, "r");
|
|
sw->fp = hts_get_bgzfp(sw->file);
|
|
if (sw->fp) bgzf_index_build_init(sw->fp);
|
|
sw->hdr = bcf_hdr_read(sw->file);
|
|
sw->mrec = 1;
|
|
sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
|
|
sw->block_size = 1024*1024*3;
|
|
sw->direction = SW_FWD;
|
|
return sw;
|
|
}
|
|
|
|
void bcf_sweep_destroy(bcf_sweep_t *sw)
|
|
{
|
|
int i;
|
|
for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]);
|
|
free(sw->idx);
|
|
free(sw->rec);
|
|
free(sw->lals);
|
|
bcf_hdr_destroy(sw->hdr);
|
|
hts_close(sw->file);
|
|
free(sw);
|
|
}
|
|
|
|
static void sw_seek(bcf_sweep_t *sw, int direction)
|
|
{
|
|
sw->direction = direction;
|
|
if ( direction==SW_FWD )
|
|
hts_useek(sw->file, sw->idx[0], 0);
|
|
else
|
|
{
|
|
sw->iidx = sw->nidx;
|
|
sw->nrec = 0;
|
|
}
|
|
}
|
|
|
|
bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw)
|
|
{
|
|
if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD);
|
|
|
|
off_t pos = hts_utell(sw->file);
|
|
|
|
bcf1_t *rec = &sw->rec[0];
|
|
int ret = bcf_read1(sw->file, sw->hdr, rec);
|
|
|
|
if ( ret!=0 ) // last record, get ready for sweeping backwards
|
|
{
|
|
sw->idx_done = 1;
|
|
if (sw->fp) sw->fp->idx_build_otf = 0;
|
|
sw_seek(sw, SW_BWD);
|
|
return NULL;
|
|
}
|
|
|
|
if ( !sw->idx_done )
|
|
{
|
|
if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size )
|
|
{
|
|
sw->nidx++;
|
|
hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
|
|
sw->idx[sw->nidx-1] = pos;
|
|
}
|
|
}
|
|
return rec;
|
|
}
|
|
|
|
bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw)
|
|
{
|
|
if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD);
|
|
if ( !sw->nrec ) sw_fill_buffer(sw);
|
|
if ( !sw->nrec ) return NULL;
|
|
return &sw->rec[ --sw->nrec ];
|
|
}
|
|
|
|
bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; }
|
|
|