From d4e7655d14d9bb9c30a79d53fafe309dd9922fa9 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 6 Jan 2012 11:24:38 -0500 Subject: [PATCH] Added ability to call multiallelic indels, if -multiallelic is included in UG arguments. Simple idea: we genotype all alleles with count >= minIndelCnt. To support this, refactored code that computes consensus alleles. To ease merging of mulitple alt alleles, we create a single vc for each alt alleles and then use VariantContextUtils.simpleMerge to carry out merging, which takes care of handling all corner conditions already. In order to use this, interface to GenotypeLikelihoodsCalculationModel changed to pass in a GenomeLocParser object (why are these objects to hard to handle??). More testing is required and feature turned off my default. --- .../GenotypeLikelihoodsCalculationModel.java | 42 ++--- ...elGenotypeLikelihoodsCalculationModel.java | 147 ++++++++++-------- ...NPGenotypeLikelihoodsCalculationModel.java | 8 +- .../genotyper/UnifiedArgumentCollection.java | 2 +- .../genotyper/UnifiedGenotyperEngine.java | 2 +- 5 files changed, 106 insertions(+), 95 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index b30a25414..ace780dd0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -72,25 +73,28 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { this.logger = logger; } - /** - * Must be overridden by concrete subclasses - * - * @param tracker rod data - * @param ref reference context - * @param contexts stratified alignment contexts - * @param contextType stratified context type - * @param priors priors to use for GLs - * @param alternateAlleleToUse the alternate allele to use, null if not set - * @param useBAQedPileup should we use the BAQed pileup or the raw one? - * @return variant context where genotypes are no-called but with GLs - */ - public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Allele alternateAlleleToUse, - boolean useBAQedPileup); + /** + * Can be overridden by concrete subclasses + * + * @param tracker rod data + * @param ref reference context + * @param contexts stratified alignment contexts + * @param contextType stratified context type + * @param priors priors to use for GLs + * @param alternateAlleleToUse the alternate allele to use, null if not set + * @param useBAQedPileup should we use the BAQed pileup or the raw one? + * @param locParser Genome Loc Parser + * @return variant context where genotypes are no-called but with GLs + */ + public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup, + GenomeLocParser locParser); + protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 8d279005b..0756caf03 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -54,17 +55,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - + private final boolean doMultiAllelicCalls; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; - + private final int maxAlternateAlleles; private PairHMMIndelErrorModel pairModel; private static ThreadLocal>> indelLikelihoodMap = new ThreadLocal>>() { - protected synchronized HashMap> initialValue() { - return new HashMap>(); - } - }; + protected synchronized HashMap> initialValue() { + return new HashMap>(); + } + }; private LinkedHashMap haplotypeMap; @@ -87,6 +88,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; + maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES; + doMultiAllelicCalls = UAC.MULTI_ALLELIC; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; @@ -95,7 +98,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private ArrayList computeConsensusAlleles(ReferenceContext ref, Map contexts, - AlignmentContextUtils.ReadOrientation contextType) { + AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { Allele refAllele=null, altAllele=null; GenomeLoc loc = ref.getLocus(); ArrayList aList = new ArrayList(); @@ -114,7 +117,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) return aList; - + for ( Map.Entry sample : contexts.entrySet() ) { // todo -- warning, can be duplicating expensive partition here AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); @@ -126,9 +129,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) - continue; + continue; if(ReadUtils.is454Read(read)) { continue; } @@ -208,63 +211,69 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } -/* if (DEBUG) { - int icount = indelPileup.getNumberOfInsertions(); - int dcount = indelPileup.getNumberOfDeletions(); - if (icount + dcount > 0) - { - List> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases()); - System.out.format("#ins: %d, #del:%d\n", insCount, delCount); - - for (int i=0 ; i < eventStrings.size() ; i++ ) { - System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second); - // int k=0; - } - System.out.println(); - } - } */ } + Collection vcs = new ArrayList(); int maxAlleleCnt = 0; String bestAltAllele = ""; + for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s); - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - bestAltAllele = s; + int curCnt = consensusIndelStrings.get(s), stop = 0; + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases,true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + } + else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } } -// if (DEBUG) -// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); - } //gdebug- - if (maxAlleleCnt < minIndelCountForGenotyping) - return aList; - if (bestAltAllele.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(bestAltAllele.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); + ArrayList vcAlleles = new ArrayList(); + vcAlleles.add(refAllele); + vcAlleles.add(altAllele); - //System.out.println(new String(ref.getBases())); - byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(vcAlleles); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases,true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); } } - else { - // insertion case - if (Allele.acceptableAlleleBases(bestAltAllele)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(bestAltAllele, false); - } - } - if (refAllele != null && altAllele != null) { - aList.add(0,refAllele); - aList.add(1,altAllele); - } + + if (vcs.isEmpty()) + return aList; // nothing else to do, no alleles passed minimum count criterion + + VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + + aList = new ArrayList(mergedVC.getAlleles()); + return aList; } @@ -277,7 +286,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, Allele alternateAlleleToUse, - boolean useBAQedPileup) { + boolean useBAQedPileup, GenomeLocParser locParser) { if ( tracker == null ) return null; @@ -294,17 +303,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood haplotypeMap.clear(); if (getAlleleListFromVCF) { - for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { - if( vc_input != null && - allowableTypes.contains(vc_input.getType()) && - ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; - } - } - // ignore places where we don't have a variant - if ( vc == null ) - return null; + for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { + if( vc_input != null && + allowableTypes.contains(vc_input.getType()) && + ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if ( vc == null ) + return null; alleleList.clear(); if (ignoreSNPAllelesWhenGenotypingIndels) { @@ -323,7 +332,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } else { - alleleList = computeConsensusAlleles(ref,contexts, contextType); + alleleList = computeConsensusAlleles(ref,contexts, contextType, locParser); if (alleleList.isEmpty()) return null; } @@ -340,7 +349,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (alleleList.isEmpty()) return null; - + refAllele = alleleList.get(0); altAllele = alleleList.get(1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index eee89674a..81c766e4d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -66,7 +63,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final AlignmentContextUtils.ReadOrientation contextType, final GenotypePriors priors, final Allele alternateAlleleToUse, - final boolean useBAQedPileup) { + final boolean useBAQedPileup, + final GenomeLocParser locParser) { if ( !(priors instanceof DiploidSNPGenotypePriors) ) throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 4639d67a7..16159393f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -109,7 +109,7 @@ public class UnifiedArgumentCollection { * For advanced users only. */ @Advanced - @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false) + @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false) public boolean MULTI_ALLELIC = false; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 34be88dbb..5d73e8d28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -243,7 +243,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); + return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) {