708 lines
22 KiB
C
708 lines
22 KiB
C
/*
|
|
Copyright (C) 2017-2021 Genome Research Ltd.
|
|
|
|
Author: Petr Danecek <pd3@sanger.ac.uk>
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
|
|
#include <config.h>
|
|
|
|
#include <assert.h>
|
|
#include <strings.h>
|
|
|
|
#include "bcf_sr_sort.h"
|
|
#include "htslib/khash_str2int.h"
|
|
#include "htslib/kbitset.h"
|
|
|
|
#define SR_REF 1
|
|
#define SR_SNP 2
|
|
#define SR_INDEL 4
|
|
#define SR_OTHER 8
|
|
#define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
|
|
|
|
// Logical AND
|
|
static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
|
|
{
|
|
// General case, bitsets of unequal size:
|
|
// int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
|
|
int i, n = bs1->n;
|
|
|
|
for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
|
|
return 0;
|
|
}
|
|
|
|
// Bitwise OR, dst will be modified, src will be left unchanged
|
|
static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
|
|
{
|
|
int i;
|
|
for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
|
|
}
|
|
|
|
|
|
static void bcf_sr_init_scores(sr_sort_t *srt)
|
|
{
|
|
int i,jbit,kbit;
|
|
|
|
// lower number = lower priority, zero means forbidden
|
|
|
|
if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
|
|
if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
|
|
if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
|
|
if ( srt->pair & BCF_SR_PAIR_SNP_REF )
|
|
{
|
|
SR_SCORE(srt,SR_SNP,SR_REF) = 2;
|
|
SR_SCORE(srt,SR_REF,SR_SNP) = 2;
|
|
}
|
|
if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
|
|
{
|
|
SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
|
|
SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
|
|
}
|
|
if ( srt->pair & BCF_SR_PAIR_ANY )
|
|
{
|
|
for (i=0; i<256; i++)
|
|
if ( !srt->score[i] ) srt->score[i] = 1;
|
|
}
|
|
|
|
// set all combinations
|
|
for (i=0; i<256; i++)
|
|
{
|
|
if ( srt->score[i] ) continue; // already set
|
|
int max = 0;
|
|
for (jbit=0; jbit<4; jbit++) // high bits
|
|
{
|
|
int j = 1<<jbit;
|
|
if ( !(i & (j<<4)) ) continue;
|
|
for (kbit=0; kbit<4; kbit++) // low bits
|
|
{
|
|
int k = 1<<kbit;
|
|
if ( !(i & k) ) continue;
|
|
if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
|
|
}
|
|
}
|
|
srt->score[i] = max;
|
|
}
|
|
}
|
|
static int multi_is_exact(var_t *avar, var_t *bvar)
|
|
{
|
|
if ( avar->nalt != bvar->nalt ) return 0;
|
|
|
|
int alen = strlen(avar->str);
|
|
int blen = strlen(bvar->str);
|
|
if ( alen != blen ) return 0;
|
|
|
|
char *abeg = avar->str;
|
|
while ( *abeg )
|
|
{
|
|
char *aend = abeg;
|
|
while ( *aend && *aend!=',' ) aend++;
|
|
|
|
char *bbeg = bvar->str;
|
|
while ( *bbeg )
|
|
{
|
|
char *bend = bbeg;
|
|
while ( *bend && *bend!=',' ) bend++;
|
|
if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
|
|
bbeg = *bend ? bend+1 : bend;
|
|
}
|
|
if ( !*bbeg ) return 0;
|
|
|
|
abeg = *aend ? aend+1 : aend;
|
|
}
|
|
return 1;
|
|
}
|
|
static int multi_is_subset(var_t *avar, var_t *bvar)
|
|
{
|
|
char *abeg = avar->str;
|
|
while ( *abeg )
|
|
{
|
|
char *aend = abeg;
|
|
while ( *aend && *aend!=',' ) aend++;
|
|
|
|
char *bbeg = bvar->str;
|
|
while ( *bbeg )
|
|
{
|
|
char *bend = bbeg;
|
|
while ( *bend && *bend!=',' ) bend++;
|
|
if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
|
|
bbeg = *bend ? bend+1 : bend;
|
|
}
|
|
abeg = *aend ? aend+1 : aend;
|
|
}
|
|
return 0;
|
|
}
|
|
static uint32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
|
|
{
|
|
varset_t *iv = &srt->vset[ivset];
|
|
varset_t *jv = &srt->vset[jvset];
|
|
|
|
// Restrictive logic: the strictest type from a group is selected,
|
|
// so that, for example, snp+ref does not lead to the inclusion of an indel
|
|
int i,j;
|
|
uint32_t min = UINT32_MAX;
|
|
for (i=0; i<iv->nvar; i++)
|
|
{
|
|
var_t *ivar = &srt->var[iv->var[i]];
|
|
for (j=0; j<jv->nvar; j++)
|
|
{
|
|
var_t *jvar = &srt->var[jv->var[j]];
|
|
if ( srt->pair & BCF_SR_PAIR_EXACT )
|
|
{
|
|
if ( ivar->type != jvar->type ) continue;
|
|
if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
|
|
if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
|
|
continue;
|
|
}
|
|
if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
|
|
if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
|
|
|
|
uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
|
|
if ( !score ) return 0; // some of the varsets in the two groups are not compatible, will not pair
|
|
if ( min>score ) min = score;
|
|
}
|
|
}
|
|
if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
|
|
|
|
assert( min!=UINT32_MAX );
|
|
|
|
uint32_t cnt = 0;
|
|
for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
|
|
for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
|
|
|
|
return (1u<<(28+min)) + cnt;
|
|
}
|
|
static void remove_vset(sr_sort_t *srt, int jvset)
|
|
{
|
|
if ( jvset+1 < srt->nvset )
|
|
{
|
|
varset_t tmp = srt->vset[jvset];
|
|
memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
|
|
srt->vset[srt->nvset-1] = tmp;
|
|
|
|
int *jmat = srt->pmat + jvset*srt->ngrp;
|
|
memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
|
|
|
|
memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
|
|
}
|
|
srt->nvset--;
|
|
}
|
|
static int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
|
|
{
|
|
int i,j;
|
|
if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
|
|
|
|
varset_t *iv = &srt->vset[ivset];
|
|
varset_t *jv = &srt->vset[jvset];
|
|
|
|
kbs_bitwise_or(iv->mask,jv->mask);
|
|
|
|
i = iv->nvar;
|
|
iv->nvar += jv->nvar;
|
|
hts_expand(int, iv->nvar, iv->mvar, iv->var);
|
|
for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
|
|
|
|
int *imat = srt->pmat + ivset*srt->ngrp;
|
|
int *jmat = srt->pmat + jvset*srt->ngrp;
|
|
for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
|
|
srt->cnt[ivset] += srt->cnt[jvset];
|
|
|
|
remove_vset(srt, jvset);
|
|
|
|
return ivset;
|
|
}
|
|
|
|
static int push_vset(sr_sort_t *srt, int ivset)
|
|
{
|
|
varset_t *iv = &srt->vset[ivset];
|
|
int i,j;
|
|
for (i=0; i<srt->sr->nreaders; i++)
|
|
{
|
|
vcf_buf_t *buf = &srt->vcf_buf[i];
|
|
buf->nrec++;
|
|
hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
|
|
buf->rec[buf->nrec-1] = NULL;
|
|
}
|
|
for (i=0; i<iv->nvar; i++)
|
|
{
|
|
var_t *var = &srt->var[ iv->var[i] ];
|
|
for (j=0; j<var->nvcf; j++)
|
|
{
|
|
int jvcf = var->vcf[j];
|
|
vcf_buf_t *buf = &srt->vcf_buf[jvcf];
|
|
buf->rec[buf->nrec-1] = var->rec[j];
|
|
}
|
|
}
|
|
remove_vset(srt, ivset);
|
|
return 0; // FIXME: check for errs in this function
|
|
}
|
|
|
|
static int cmpstringp(const void *p1, const void *p2)
|
|
{
|
|
return strcmp(* (char * const *) p1, * (char * const *) p2);
|
|
}
|
|
|
|
#define DEBUG_VSETS 0
|
|
#if DEBUG_VSETS
|
|
void debug_vsets(sr_sort_t *srt)
|
|
{
|
|
int i,j,k;
|
|
for (i=0; i<srt->nvset; i++)
|
|
{
|
|
fprintf(stderr,"dbg_vset %d:", i);
|
|
for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
|
|
fprintf(stderr,"\t");
|
|
for (j=0; j<srt->vset[i].nvar; j++)
|
|
{
|
|
var_t *var = &srt->var[srt->vset[i].var[j]];
|
|
fprintf(stderr,"\t%s",var->str);
|
|
for (k=0; k<var->nvcf; k++)
|
|
fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
|
|
}
|
|
fprintf(stderr,"\n");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#define DEBUG_VBUF 0
|
|
#if DEBUG_VBUF
|
|
void debug_vbuf(sr_sort_t *srt)
|
|
{
|
|
int i, j;
|
|
for (j=0; j<srt->vcf_buf[0].nrec; j++)
|
|
{
|
|
fprintf(stderr,"dbg_vbuf %d:\t", j);
|
|
for (i=0; i<srt->sr->nreaders; i++)
|
|
{
|
|
vcf_buf_t *buf = &srt->vcf_buf[i];
|
|
fprintf(stderr,"\t%"PRIhts_pos, buf->rec[j] ? buf->rec[j]->pos+1 : 0);
|
|
}
|
|
fprintf(stderr,"\n");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
static char *grp_create_key(sr_sort_t *srt)
|
|
{
|
|
if ( !srt->str.l ) return strdup("");
|
|
int i;
|
|
hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
|
|
for (i=0; i<srt->noff; i++)
|
|
{
|
|
srt->charp[i] = srt->str.s + srt->off[i];
|
|
if ( i>0 ) srt->charp[i][-1] = 0;
|
|
}
|
|
qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
|
|
char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
|
|
for (i=0; i<srt->noff; i++)
|
|
{
|
|
int len = strlen(srt->charp[i]);
|
|
memcpy(ptr, srt->charp[i], len);
|
|
ptr += len + 1;
|
|
ptr[-1] = i+1==srt->noff ? 0 : ';';
|
|
}
|
|
return ret;
|
|
}
|
|
int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
|
|
{
|
|
hts_expand(int,idx+1,srt->mactive,srt->active);
|
|
srt->nactive = 1;
|
|
srt->active[srt->nactive - 1] = idx;
|
|
return 0; // FIXME: check for errs in this function
|
|
}
|
|
int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
|
|
{
|
|
hts_expand(int,idx+1,srt->mactive,srt->active);
|
|
srt->nactive++;
|
|
srt->active[srt->nactive - 1] = idx;
|
|
return 0; // FIXME: check for errs in this function
|
|
}
|
|
static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
|
|
{
|
|
if ( !srt->grp_str2int )
|
|
{
|
|
// first time here, initialize
|
|
if ( !srt->pair )
|
|
{
|
|
if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
|
|
bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
|
|
}
|
|
bcf_sr_init_scores(srt);
|
|
srt->grp_str2int = khash_str2int_init();
|
|
srt->var_str2int = khash_str2int_init();
|
|
}
|
|
int k;
|
|
khash_t(str2int) *hash;
|
|
hash = srt->grp_str2int;
|
|
for (k=0; k < kh_end(hash); k++)
|
|
if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
|
|
hash = srt->var_str2int;
|
|
for (k=0; k < kh_end(hash); k++)
|
|
if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
|
|
kh_clear(str2int, srt->grp_str2int);
|
|
kh_clear(str2int, srt->var_str2int);
|
|
srt->ngrp = srt->nvar = srt->nvset = 0;
|
|
|
|
grp_t grp;
|
|
memset(&grp,0,sizeof(grp_t));
|
|
|
|
// group VCFs into groups, each with a unique combination of variants in the duplicate lines
|
|
int ireader,ivar,irec,igrp,ivset,iact;
|
|
for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
|
|
for (iact=0; iact<srt->nactive; iact++)
|
|
{
|
|
ireader = srt->active[iact];
|
|
bcf_sr_t *reader = &readers->readers[ireader];
|
|
int rid = bcf_hdr_name2id(reader->header, chr);
|
|
grp.nvar = 0;
|
|
hts_expand(int,reader->nbuffer,srt->moff,srt->off);
|
|
srt->noff = 0;
|
|
srt->str.l = 0;
|
|
for (irec=1; irec<=reader->nbuffer; irec++)
|
|
{
|
|
bcf1_t *line = reader->buffer[irec];
|
|
if ( line->rid!=rid || line->pos!=min_pos ) break;
|
|
|
|
if ( srt->str.l ) kputc(';',&srt->str);
|
|
srt->off[srt->noff++] = srt->str.l;
|
|
size_t beg = srt->str.l;
|
|
int end_pos = -1;
|
|
for (ivar=1; ivar<line->n_allele; ivar++)
|
|
{
|
|
if ( ivar>1 ) kputc(',',&srt->str);
|
|
kputs(line->d.allele[0],&srt->str);
|
|
kputc('>',&srt->str);
|
|
kputs(line->d.allele[ivar],&srt->str);
|
|
|
|
// If symbolic allele, check also the END tag in case there are multiple events,
|
|
// such as <DEL>s, starting at the same positions
|
|
if ( line->d.allele[ivar][0]=='<' )
|
|
{
|
|
if ( end_pos==-1 )
|
|
{
|
|
bcf_info_t *end_info = bcf_get_info(reader->header,line,"END");
|
|
if ( end_info )
|
|
end_pos = (int)end_info->v1.i; // this is only to create a unique id, we don't mind a potential int64 overflow
|
|
else
|
|
end_pos = 0;
|
|
}
|
|
if ( end_pos )
|
|
{
|
|
kputc('/',&srt->str);
|
|
kputw(end_pos, &srt->str);
|
|
}
|
|
}
|
|
}
|
|
if ( line->n_allele==1 )
|
|
{
|
|
kputs(line->d.allele[0],&srt->str);
|
|
kputsn(">.",2,&srt->str);
|
|
}
|
|
|
|
// Create new variant or attach to existing one. But careful, there can be duplicate
|
|
// records with the same POS,REF,ALT (e.g. in dbSNP-b142)
|
|
char *var_str = beg + srt->str.s;
|
|
int ret, var_idx = 0, var_end = srt->str.l;
|
|
while ( 1 )
|
|
{
|
|
ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
|
|
if ( ret==-1 ) break;
|
|
|
|
var_t *var = &srt->var[ivar];
|
|
if ( var->vcf[var->nvcf-1] != ireader ) break;
|
|
|
|
srt->str.l = var_end;
|
|
kputw(var_idx, &srt->str);
|
|
var_str = beg + srt->str.s;
|
|
var_idx++;
|
|
}
|
|
if ( ret==-1 )
|
|
{
|
|
ivar = srt->nvar++;
|
|
hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
|
|
srt->var[ivar].nvcf = 0;
|
|
khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
|
|
free(srt->var[ivar].str); // possible left-over from the previous position
|
|
}
|
|
var_t *var = &srt->var[ivar];
|
|
var->nalt = line->n_allele - 1;
|
|
var->type = bcf_get_variant_types(line);
|
|
srt->str.s[var_end] = 0;
|
|
if ( ret==-1 )
|
|
var->str = strdup(var_str);
|
|
|
|
int mvcf = var->mvcf;
|
|
var->nvcf++;
|
|
hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
|
|
if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
|
|
var->vcf[var->nvcf-1] = ireader;
|
|
var->rec[var->nvcf-1] = line;
|
|
|
|
grp.nvar++;
|
|
hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
|
|
grp.var[grp.nvar-1] = ivar;
|
|
}
|
|
char *grp_key = grp_create_key(srt);
|
|
int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
|
|
if ( ret==-1 )
|
|
{
|
|
igrp = srt->ngrp++;
|
|
hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
|
|
free(srt->grp[igrp].var);
|
|
srt->grp[igrp] = grp;
|
|
srt->grp[igrp].key = grp_key;
|
|
khash_str2int_set(srt->grp_str2int, grp_key, igrp);
|
|
memset(&grp,0,sizeof(grp_t));
|
|
}
|
|
else
|
|
free(grp_key);
|
|
srt->grp[igrp].nvcf++;
|
|
}
|
|
free(grp.var);
|
|
|
|
// initialize bitmask - which groups is the variant present in
|
|
for (ivar=0; ivar<srt->nvar; ivar++)
|
|
{
|
|
if ( kbs_resize(&srt->var[ivar].mask, srt->ngrp) < 0 )
|
|
{
|
|
fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__);
|
|
exit(1);
|
|
}
|
|
kbs_clear(srt->var[ivar].mask);
|
|
}
|
|
for (igrp=0; igrp<srt->ngrp; igrp++)
|
|
{
|
|
for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
|
|
{
|
|
int i = srt->grp[igrp].var[ivar];
|
|
kbs_insert(srt->var[i].mask, igrp);
|
|
}
|
|
}
|
|
|
|
// create the initial list of variant sets
|
|
for (ivar=0; ivar<srt->nvar; ivar++)
|
|
{
|
|
ivset = srt->nvset++;
|
|
hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
|
|
|
|
varset_t *vset = &srt->vset[ivset];
|
|
vset->nvar = 1;
|
|
hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
|
|
vset->var[vset->nvar-1] = ivar;
|
|
var_t *var = &srt->var[ivar];
|
|
vset->cnt = var->nvcf;
|
|
if ( kbs_resize(&vset->mask, srt->ngrp) < 0 )
|
|
{
|
|
fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__);
|
|
exit(1);
|
|
}
|
|
kbs_clear(vset->mask);
|
|
kbs_bitwise_or(vset->mask, var->mask);
|
|
|
|
int type = 0;
|
|
if ( var->type==VCF_REF ) type |= SR_REF;
|
|
else
|
|
{
|
|
if ( var->type & VCF_SNP ) type |= SR_SNP;
|
|
if ( var->type & VCF_MNP ) type |= SR_SNP;
|
|
if ( var->type & VCF_INDEL ) type |= SR_INDEL;
|
|
if ( var->type & VCF_OTHER ) type |= SR_OTHER;
|
|
}
|
|
var->type = type;
|
|
}
|
|
#if DEBUG_VSETS
|
|
debug_vsets(srt);
|
|
#endif
|
|
|
|
// initialize the pairing matrix
|
|
hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
|
|
hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
|
|
memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
|
|
for (ivset=0; ivset<srt->nvset; ivset++)
|
|
{
|
|
varset_t *vset = &srt->vset[ivset];
|
|
for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
|
|
srt->cnt[ivset] = vset->cnt;
|
|
}
|
|
|
|
// pair the lines
|
|
while ( srt->nvset )
|
|
{
|
|
#if DEBUG_VSETS
|
|
fprintf(stderr,"\n");
|
|
debug_vsets(srt);
|
|
#endif
|
|
|
|
int imax = 0;
|
|
for (ivset=1; ivset<srt->nvset; ivset++)
|
|
if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
|
|
|
|
int ipair = -1;
|
|
uint32_t max_score = 0;
|
|
for (ivset=0; ivset<srt->nvset; ivset++)
|
|
{
|
|
if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue; // cannot be merged
|
|
uint32_t score = pairing_score(srt, imax, ivset);
|
|
// fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
|
|
if ( max_score < score ) { max_score = score; ipair = ivset; }
|
|
}
|
|
|
|
// merge rows creating a new variant set this way
|
|
if ( ipair!=-1 && ipair!=imax )
|
|
{
|
|
imax = merge_vsets(srt, imax, ipair);
|
|
continue;
|
|
}
|
|
|
|
push_vset(srt, imax);
|
|
}
|
|
|
|
srt->chr = chr;
|
|
srt->pos = min_pos;
|
|
|
|
return 0; // FIXME: check for errs in this function
|
|
}
|
|
|
|
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
|
|
{
|
|
int i,j;
|
|
assert( srt->nactive>0 );
|
|
|
|
if ( srt->nsr != readers->nreaders )
|
|
{
|
|
srt->sr = readers;
|
|
if ( srt->nsr < readers->nreaders )
|
|
{
|
|
srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
|
|
memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
|
|
if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
|
|
}
|
|
srt->nsr = readers->nreaders;
|
|
srt->chr = NULL;
|
|
}
|
|
if ( srt->nactive == 1 )
|
|
{
|
|
if ( readers->nreaders>1 )
|
|
memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
|
|
bcf_sr_t *reader = &readers->readers[srt->active[0]];
|
|
assert( reader->buffer[1]->pos==min_pos );
|
|
bcf1_t *tmp = reader->buffer[0];
|
|
for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
|
|
reader->buffer[ reader->nbuffer ] = tmp;
|
|
reader->nbuffer--;
|
|
readers->has_line[srt->active[0]] = 1;
|
|
return 1;
|
|
}
|
|
if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
|
|
|
|
if ( !srt->vcf_buf[0].nrec ) return 0;
|
|
|
|
#if DEBUG_VBUF
|
|
debug_vbuf(srt);
|
|
#endif
|
|
|
|
int nret = 0;
|
|
for (i=0; i<srt->sr->nreaders; i++)
|
|
{
|
|
vcf_buf_t *buf = &srt->vcf_buf[i];
|
|
|
|
if ( buf->rec[0] )
|
|
{
|
|
bcf_sr_t *reader = &srt->sr->readers[i];
|
|
for (j=1; j<=reader->nbuffer; j++)
|
|
if ( reader->buffer[j] == buf->rec[0] ) break;
|
|
|
|
assert( j<=reader->nbuffer );
|
|
|
|
bcf1_t *tmp = reader->buffer[0];
|
|
reader->buffer[0] = reader->buffer[j++];
|
|
for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
|
|
reader->buffer[ reader->nbuffer ] = tmp;
|
|
reader->nbuffer--;
|
|
|
|
nret++;
|
|
srt->sr->has_line[i] = 1;
|
|
}
|
|
else
|
|
srt->sr->has_line[i] = 0;
|
|
|
|
buf->nrec--;
|
|
if ( buf->nrec > 0 )
|
|
memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
|
|
}
|
|
return nret;
|
|
}
|
|
void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
|
|
{
|
|
//vcf_buf is allocated only in bcf_sr_sort_next
|
|
//So, a call to bcf_sr_add_reader() followed immediately by bcf_sr_remove_reader()
|
|
//would cause the program to crash in this segment
|
|
if (srt->vcf_buf)
|
|
{
|
|
free(srt->vcf_buf[i].rec);
|
|
if ( i+1 < srt->nsr )
|
|
memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
|
|
memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
|
|
}
|
|
}
|
|
sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
|
|
{
|
|
if ( !srt ) return calloc(1,sizeof(sr_sort_t));
|
|
memset(srt,0,sizeof(sr_sort_t));
|
|
return srt;
|
|
}
|
|
void bcf_sr_sort_reset(sr_sort_t *srt)
|
|
{
|
|
srt->chr = NULL;
|
|
}
|
|
void bcf_sr_sort_destroy(sr_sort_t *srt)
|
|
{
|
|
free(srt->active);
|
|
if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
|
|
if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
|
|
int i;
|
|
for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
|
|
free(srt->vcf_buf);
|
|
for (i=0; i<srt->mvar; i++)
|
|
{
|
|
free(srt->var[i].str);
|
|
free(srt->var[i].vcf);
|
|
free(srt->var[i].rec);
|
|
kbs_destroy(srt->var[i].mask);
|
|
}
|
|
free(srt->var);
|
|
for (i=0; i<srt->mgrp; i++)
|
|
free(srt->grp[i].var);
|
|
free(srt->grp);
|
|
for (i=0; i<srt->mvset; i++)
|
|
{
|
|
kbs_destroy(srt->vset[i].mask);
|
|
free(srt->vset[i].var);
|
|
}
|
|
free(srt->vset);
|
|
free(srt->str.s);
|
|
free(srt->off);
|
|
free(srt->charp);
|
|
free(srt->cnt);
|
|
free(srt->pmat);
|
|
memset(srt,0,sizeof(*srt));
|
|
}
|
|
|