Added the ability to call multiallelic indels when -multiallelic is included in the UG arguments. Simple idea: we genotype all alleles with count >= minIndelCnt.
To support this, refactored the code that computes consensus alleles. To ease merging of multiple alt alleles, we create a single VC for each alt allele and then use VariantContextUtils.simpleMerge to carry out the merging, which already takes care of handling all corner conditions. In order to use this, the interface to GenotypeLikelihoodsCalculationModel changed to pass in a GenomeLocParser object (why are these objects so hard to handle??). More testing is required and the feature is turned off by default.
This commit is contained in:
parent
43224ef364
commit
d4e7655d14
|
|
@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -72,25 +73,28 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
|||
this.logger = logger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param tracker rod data
|
||||
* @param ref reference context
|
||||
* @param contexts stratified alignment contexts
|
||||
* @param contextType stratified context type
|
||||
* @param priors priors to use for GLs
|
||||
* @param alternateAlleleToUse the alternate allele to use, null if not set
|
||||
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
|
||||
* @return variant context where genotypes are no-called but with GLs
|
||||
*/
|
||||
public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup);
|
||||
/**
|
||||
* Can be overridden by concrete subclasses
|
||||
*
|
||||
* @param tracker rod data
|
||||
* @param ref reference context
|
||||
* @param contexts stratified alignment contexts
|
||||
* @param contextType stratified context type
|
||||
* @param priors priors to use for GLs
|
||||
* @param alternateAlleleToUse the alternate allele to use, null if not set
|
||||
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
|
||||
* @param locParser Genome Loc Parser
|
||||
* @return variant context where genotypes are no-called but with GLs
|
||||
*/
|
||||
public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup,
|
||||
GenomeLocParser locParser);
|
||||
|
||||
|
||||
protected int getFilteredDepth(ReadBackedPileup pileup) {
|
||||
int count = 0;
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
|
|
@ -54,17 +55,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
private final boolean getAlleleListFromVCF;
|
||||
|
||||
private boolean DEBUG = false;
|
||||
|
||||
private final boolean doMultiAllelicCalls;
|
||||
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
|
||||
|
||||
private final int maxAlternateAlleles;
|
||||
private PairHMMIndelErrorModel pairModel;
|
||||
|
||||
private static ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>> indelLikelihoodMap =
|
||||
new ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>>() {
|
||||
protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
|
||||
return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
|
||||
}
|
||||
};
|
||||
protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
|
||||
return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
|
||||
}
|
||||
};
|
||||
|
||||
private LinkedHashMap<Allele,Haplotype> haplotypeMap;
|
||||
|
||||
|
|
@ -87,6 +88,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
|
||||
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
|
||||
maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES;
|
||||
doMultiAllelicCalls = UAC.MULTI_ALLELIC;
|
||||
|
||||
haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
|
||||
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
|
||||
|
|
@ -95,7 +98,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType) {
|
||||
AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) {
|
||||
Allele refAllele=null, altAllele=null;
|
||||
GenomeLoc loc = ref.getLocus();
|
||||
ArrayList<Allele> aList = new ArrayList<Allele>();
|
||||
|
|
@ -114,7 +117,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
|
||||
return aList;
|
||||
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
// todo -- warning, can be duplicating expensive partition here
|
||||
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
|
||||
|
|
@ -126,9 +129,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
|
||||
//SAMRecord read = p.getRead();
|
||||
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
|
||||
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
|
||||
if (read == null)
|
||||
continue;
|
||||
continue;
|
||||
if(ReadUtils.is454Read(read)) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -208,63 +211,69 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
}
|
||||
}
|
||||
|
||||
/* if (DEBUG) {
|
||||
int icount = indelPileup.getNumberOfInsertions();
|
||||
int dcount = indelPileup.getNumberOfDeletions();
|
||||
if (icount + dcount > 0)
|
||||
{
|
||||
List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
|
||||
System.out.format("#ins: %d, #del:%d\n", insCount, delCount);
|
||||
|
||||
for (int i=0 ; i < eventStrings.size() ; i++ ) {
|
||||
System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second);
|
||||
// int k=0;
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
} */
|
||||
}
|
||||
|
||||
Collection<VariantContext> vcs = new ArrayList<VariantContext>();
|
||||
int maxAlleleCnt = 0;
|
||||
String bestAltAllele = "";
|
||||
|
||||
for (String s : consensusIndelStrings.keySet()) {
|
||||
int curCnt = consensusIndelStrings.get(s);
|
||||
if (curCnt > maxAlleleCnt) {
|
||||
maxAlleleCnt = curCnt;
|
||||
bestAltAllele = s;
|
||||
int curCnt = consensusIndelStrings.get(s), stop = 0;
|
||||
// if observed count is above minimum threshold, we will genotype this allele
|
||||
if (curCnt < minIndelCountForGenotyping)
|
||||
continue;
|
||||
|
||||
if (s.startsWith("D")) {
|
||||
// get deletion length
|
||||
int dLen = Integer.valueOf(s.substring(1));
|
||||
// get ref bases of accurate deletion
|
||||
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
|
||||
stop = loc.getStart() + dLen;
|
||||
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
|
||||
|
||||
if (Allele.acceptableAlleleBases(refBases)) {
|
||||
refAllele = Allele.create(refBases,true);
|
||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// insertion case
|
||||
if (Allele.acceptableAlleleBases(s)) {
|
||||
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
altAllele = Allele.create(s, false);
|
||||
stop = loc.getStart();
|
||||
}
|
||||
}
|
||||
// if (DEBUG)
|
||||
// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
|
||||
} //gdebug-
|
||||
|
||||
if (maxAlleleCnt < minIndelCountForGenotyping)
|
||||
return aList;
|
||||
|
||||
if (bestAltAllele.startsWith("D")) {
|
||||
// get deletion length
|
||||
int dLen = Integer.valueOf(bestAltAllele.substring(1));
|
||||
// get ref bases of accurate deletion
|
||||
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
|
||||
ArrayList vcAlleles = new ArrayList<Allele>();
|
||||
vcAlleles.add(refAllele);
|
||||
vcAlleles.add(altAllele);
|
||||
|
||||
//System.out.println(new String(ref.getBases()));
|
||||
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
|
||||
final VariantContextBuilder builder = new VariantContextBuilder().source("");
|
||||
builder.loc(loc.getContig(), loc.getStart(), stop);
|
||||
builder.alleles(vcAlleles);
|
||||
builder.referenceBaseForIndel(ref.getBase());
|
||||
builder.noGenotypes();
|
||||
if (doMultiAllelicCalls)
|
||||
vcs.add(builder.make());
|
||||
else {
|
||||
if (curCnt > maxAlleleCnt) {
|
||||
maxAlleleCnt = curCnt;
|
||||
vcs.clear();
|
||||
vcs.add(builder.make());
|
||||
}
|
||||
|
||||
if (Allele.acceptableAlleleBases(refBases)) {
|
||||
refAllele = Allele.create(refBases,true);
|
||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// insertion case
|
||||
if (Allele.acceptableAlleleBases(bestAltAllele)) {
|
||||
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
altAllele = Allele.create(bestAltAllele, false);
|
||||
}
|
||||
}
|
||||
if (refAllele != null && altAllele != null) {
|
||||
aList.add(0,refAllele);
|
||||
aList.add(1,altAllele);
|
||||
}
|
||||
|
||||
if (vcs.isEmpty())
|
||||
return aList; // nothing else to do, no alleles passed minimum count criterion
|
||||
|
||||
VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
|
||||
|
||||
aList = new ArrayList<Allele>(mergedVC.getAlleles());
|
||||
|
||||
return aList;
|
||||
|
||||
}
|
||||
|
|
@ -277,7 +286,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup) {
|
||||
boolean useBAQedPileup, GenomeLocParser locParser) {
|
||||
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
|
@ -294,17 +303,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
haplotypeMap.clear();
|
||||
|
||||
if (getAlleleListFromVCF) {
|
||||
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
|
||||
if( vc_input != null &&
|
||||
allowableTypes.contains(vc_input.getType()) &&
|
||||
ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
vc = vc_input;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return null;
|
||||
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
|
||||
if( vc_input != null &&
|
||||
allowableTypes.contains(vc_input.getType()) &&
|
||||
ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
vc = vc_input;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
alleleList.clear();
|
||||
if (ignoreSNPAllelesWhenGenotypingIndels) {
|
||||
|
|
@ -323,7 +332,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
}
|
||||
else {
|
||||
alleleList = computeConsensusAlleles(ref,contexts, contextType);
|
||||
alleleList = computeConsensusAlleles(ref,contexts, contextType, locParser);
|
||||
if (alleleList.isEmpty())
|
||||
return null;
|
||||
}
|
||||
|
|
@ -340,7 +349,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
if (alleleList.isEmpty())
|
||||
return null;
|
||||
|
||||
|
||||
refAllele = alleleList.get(0);
|
||||
altAllele = alleleList.get(1);
|
||||
|
||||
|
|
|
|||
|
|
@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
|
@ -66,7 +63,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenotypePriors priors,
|
||||
final Allele alternateAlleleToUse,
|
||||
final boolean useBAQedPileup) {
|
||||
final boolean useBAQedPileup,
|
||||
final GenomeLocParser locParser) {
|
||||
|
||||
if ( !(priors instanceof DiploidSNPGenotypePriors) )
|
||||
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
|
||||
|
|
|
|||
|
|
@ -109,7 +109,7 @@ public class UnifiedArgumentCollection {
|
|||
* For advanced users only.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false)
|
||||
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false)
|
||||
public boolean MULTI_ALLELIC = false;
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -243,7 +243,7 @@ public class UnifiedGenotyperEngine {
|
|||
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
|
||||
}
|
||||
|
||||
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);
|
||||
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser);
|
||||
}
|
||||
|
||||
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue