Added the ability to call multiallelic indels if -multiallelic is included in the UG arguments. Simple idea: we genotype all alleles with count >= minIndelCnt.

To support this, refactored the code that computes consensus alleles. To ease merging of multiple alt alleles, we create a single VariantContext for each alt allele and then use VariantContextUtils.simpleMerge to carry out the merging, which already takes care of handling all corner conditions. In order to use this, the interface to GenotypeLikelihoodsCalculationModel changed to pass in a GenomeLocParser object (why are these objects so hard to handle??).
More testing is required, and the feature is turned off by default.
This commit is contained in:
Guillermo del Angel 2012-01-06 11:24:38 -05:00
parent 43224ef364
commit d4e7655d14
5 changed files with 106 additions and 95 deletions

View File

@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -72,25 +73,28 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
this.logger = logger; this.logger = logger;
} }
/** /**
* Must be overridden by concrete subclasses * Can be overridden by concrete subclasses
* *
* @param tracker rod data * @param tracker rod data
* @param ref reference context * @param ref reference context
* @param contexts stratified alignment contexts * @param contexts stratified alignment contexts
* @param contextType stratified context type * @param contextType stratified context type
* @param priors priors to use for GLs * @param priors priors to use for GLs
* @param alternateAlleleToUse the alternate allele to use, null if not set * @param alternateAlleleToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one? * @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @return variant context where genotypes are no-called but with GLs * @param locParser Genome Loc Parser
*/ * @return variant context where genotypes are no-called but with GLs
public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker, */
ReferenceContext ref, public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
Map<String, AlignmentContext> contexts, ReferenceContext ref,
AlignmentContextUtils.ReadOrientation contextType, Map<String, AlignmentContext> contexts,
GenotypePriors priors, AlignmentContextUtils.ReadOrientation contextType,
Allele alternateAlleleToUse, GenotypePriors priors,
boolean useBAQedPileup); Allele alternateAlleleToUse,
boolean useBAQedPileup,
GenomeLocParser locParser);
protected int getFilteredDepth(ReadBackedPileup pileup) { protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0; int count = 0;

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
@ -54,17 +55,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private final boolean getAlleleListFromVCF; private final boolean getAlleleListFromVCF;
private boolean DEBUG = false; private boolean DEBUG = false;
private final boolean doMultiAllelicCalls;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private final int maxAlternateAlleles;
private PairHMMIndelErrorModel pairModel; private PairHMMIndelErrorModel pairModel;
private static ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>> indelLikelihoodMap = private static ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>> indelLikelihoodMap =
new ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>>() { new ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>>() {
protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() { protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>(); return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
} }
}; };
private LinkedHashMap<Allele,Haplotype> haplotypeMap; private LinkedHashMap<Allele,Haplotype> haplotypeMap;
@ -87,6 +88,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES;
doMultiAllelicCalls = UAC.MULTI_ALLELIC;
haplotypeMap = new LinkedHashMap<Allele,Haplotype>(); haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
@ -95,7 +98,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref, private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts, Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) { AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) {
Allele refAllele=null, altAllele=null; Allele refAllele=null, altAllele=null;
GenomeLoc loc = ref.getLocus(); GenomeLoc loc = ref.getLocus();
ArrayList<Allele> aList = new ArrayList<Allele>(); ArrayList<Allele> aList = new ArrayList<Allele>();
@ -114,7 +117,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
return aList; return aList;
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
// todo -- warning, can be duplicating expensive partition here // todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
@ -126,9 +129,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
//SAMRecord read = p.getRead(); //SAMRecord read = p.getRead();
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null) if (read == null)
continue; continue;
if(ReadUtils.is454Read(read)) { if(ReadUtils.is454Read(read)) {
continue; continue;
} }
@ -208,63 +211,69 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
} }
} }
/* if (DEBUG) {
int icount = indelPileup.getNumberOfInsertions();
int dcount = indelPileup.getNumberOfDeletions();
if (icount + dcount > 0)
{
List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
System.out.format("#ins: %d, #del:%d\n", insCount, delCount);
for (int i=0 ; i < eventStrings.size() ; i++ ) {
System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second);
// int k=0;
}
System.out.println();
}
} */
} }
Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0; int maxAlleleCnt = 0;
String bestAltAllele = ""; String bestAltAllele = "";
for (String s : consensusIndelStrings.keySet()) { for (String s : consensusIndelStrings.keySet()) {
int curCnt = consensusIndelStrings.get(s); int curCnt = consensusIndelStrings.get(s), stop = 0;
if (curCnt > maxAlleleCnt) { // if observed count if above minimum threshold, we will genotype this allele
maxAlleleCnt = curCnt; if (curCnt < minIndelCountForGenotyping)
bestAltAllele = s; continue;
if (s.startsWith("D")) {
// get deletion length
int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
stop = loc.getStart() + dLen;
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases,true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
}
else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
} }
// if (DEBUG)
// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
} //gdebug-
if (maxAlleleCnt < minIndelCountForGenotyping)
return aList;
if (bestAltAllele.startsWith("D")) { ArrayList vcAlleles = new ArrayList<Allele>();
// get deletion length vcAlleles.add(refAllele);
int dLen = Integer.valueOf(bestAltAllele.substring(1)); vcAlleles.add(altAllele);
// get ref bases of accurate deletion
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
//System.out.println(new String(ref.getBases())); final VariantContextBuilder builder = new VariantContextBuilder().source("");
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(vcAlleles);
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls)
vcs.add(builder.make());
else {
if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases,true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
} }
} }
else {
// insertion case if (vcs.isEmpty())
if (Allele.acceptableAlleleBases(bestAltAllele)) { return aList; // nothing else to do, no alleles passed minimum count criterion
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(bestAltAllele, false); VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
}
} aList = new ArrayList<Allele>(mergedVC.getAlleles());
if (refAllele != null && altAllele != null) {
aList.add(0,refAllele);
aList.add(1,altAllele);
}
return aList; return aList;
} }
@ -277,7 +286,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
AlignmentContextUtils.ReadOrientation contextType, AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors, GenotypePriors priors,
Allele alternateAlleleToUse, Allele alternateAlleleToUse,
boolean useBAQedPileup) { boolean useBAQedPileup, GenomeLocParser locParser) {
if ( tracker == null ) if ( tracker == null )
return null; return null;
@ -294,17 +303,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
haplotypeMap.clear(); haplotypeMap.clear();
if (getAlleleListFromVCF) { if (getAlleleListFromVCF) {
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
if( vc_input != null && if( vc_input != null &&
allowableTypes.contains(vc_input.getType()) && allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) { ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input; vc = vc_input;
break; break;
} }
} }
// ignore places where we don't have a variant // ignore places where we don't have a variant
if ( vc == null ) if ( vc == null )
return null; return null;
alleleList.clear(); alleleList.clear();
if (ignoreSNPAllelesWhenGenotypingIndels) { if (ignoreSNPAllelesWhenGenotypingIndels) {
@ -323,7 +332,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
} }
else { else {
alleleList = computeConsensusAlleles(ref,contexts, contextType); alleleList = computeConsensusAlleles(ref,contexts, contextType, locParser);
if (alleleList.isEmpty()) if (alleleList.isEmpty())
return null; return null;
} }
@ -340,7 +349,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (alleleList.isEmpty()) if (alleleList.isEmpty())
return null; return null;
refAllele = alleleList.get(0); refAllele = alleleList.get(0);
altAllele = alleleList.get(1); altAllele = alleleList.get(1);

View File

@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.StingException;
@ -66,7 +63,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final AlignmentContextUtils.ReadOrientation contextType, final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors, final GenotypePriors priors,
final Allele alternateAlleleToUse, final Allele alternateAlleleToUse,
final boolean useBAQedPileup) { final boolean useBAQedPileup,
final GenomeLocParser locParser) {
if ( !(priors instanceof DiploidSNPGenotypePriors) ) if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");

View File

@ -109,7 +109,7 @@ public class UnifiedArgumentCollection {
* For advanced users only. * For advanced users only.
*/ */
@Advanced @Advanced
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false) @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false)
public boolean MULTI_ALLELIC = false; public boolean MULTI_ALLELIC = false;
/** /**

View File

@ -243,7 +243,7 @@ public class UnifiedGenotyperEngine {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
} }
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser);
} }
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) { private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {