From 6be5e8286037e1a20b7b85508411db43d16a36c0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Mar 2012 07:42:15 -0400 Subject: [PATCH 1/4] VariantEval scalability optimizations -- StateKey no longer extends TreeMap. It's now a final immutable data structure that caches its toString and hashCode values. TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted toString function. -- NewEvaluationContext has a method makeStateKey() that contains all of the functionality that once was spread around VEUtils -- AnalysisModuleScanner uses an annotationCache to speed up the reflections getAnnotations() call when invoked over and over on the same objects. Still expensive to convert each field to a string for the cache, but the only way around that is a complete refactoring of the onTraversalDone of VE -- VariantEvaluator base class has a cached getSimpleName() function -- VEUtils: general cleanup due to refactoring of StateKey -- VEWalker: much better iteration of map data structures. If you need to iterate over all key/value pairs use the Map.Entry construct with entrySet. This is far better than iterating over the keys and calling get() on each key. 
--- .../varianteval/VariantEvalWalker.java | 13 ++-- .../evaluators/VariantEvaluator.java | 9 +++ .../util/AnalysisModuleScanner.java | 14 +++- .../util/NewEvaluationContext.java | 10 +++ .../walkers/varianteval/util/StateKey.java | 74 +++++++++++++++---- .../varianteval/util/VariantEvalUtils.java | 25 +------ 6 files changed, 101 insertions(+), 44 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 2b9f159ac..4bfd90eac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -482,11 +482,13 @@ public class VariantEvalWalker extends RodWalker implements Tr public void onTraversalDone(Integer result) { logger.info("Finalizing variant report"); - for ( StateKey stateKey : evaluationContexts.keySet() ) { - NewEvaluationContext nec = evaluationContexts.get(stateKey); + for ( Map.Entry ecElt : evaluationContexts.entrySet() ) { + final StateKey stateKey = ecElt.getKey(); + final NewEvaluationContext nec = ecElt.getValue(); for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) { ve.finalizeEvaluation(); + final String veName = ve.getSimpleName(); // ve.getClass().getSimpleName(); AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); Map datamap = scanner.getData(); @@ -498,7 +500,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if (field.get(ve) instanceof TableType) { TableType t = (TableType) field.get(ve); - final String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); + final String subTableName = veName + "." 
+ field.getName(); final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; @@ -539,11 +541,10 @@ public class VariantEvalWalker extends RodWalker implements Tr } } } else { - GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); + GATKReportTable table = report.getTable(veName); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getName(); - + final String columnName = vs.getName(); table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index d5cf685de..226429439 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -8,6 +8,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; public abstract class VariantEvaluator { private VariantEvalWalker walker; + private final String simpleName; + + protected VariantEvaluator() { + this.simpleName = getClass().getSimpleName(); + } public void initialize(VariantEvalWalker walker) { this.walker = walker; @@ -90,4 +95,8 @@ public abstract class VariantEvaluator { protected static String formattedRatio(final int num, final int denom) { return denom == 0 ? 
"NA" : String.format("%.2f", num / (1.0 * denom)); } + + public String getSimpleName() { + return simpleName; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java index db44e9e28..793bafdd0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java @@ -27,6 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.lang.annotation.Annotation; import java.lang.reflect.Field; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -40,6 +41,7 @@ import java.util.Map; * the object, a Mashalling object can serialize or deserialize a analysis module. */ public class AnalysisModuleScanner { + final private static Map annotationCache = new HashMap(); // what we extracted from the class private Map datums = new LinkedHashMap(); // the data we've discovered @@ -84,12 +86,22 @@ public class AnalysisModuleScanner { // get the fields from the class, and extract for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) { for (Field f : superCls.getDeclaredFields()) - for (Annotation annotation : f.getAnnotations()) { + for (Annotation annotation : getAnnotations(f)) { if (annotation.annotationType().equals(DataPoint.class)) datums.put(f,(DataPoint) annotation); } } } + + private Annotation[] getAnnotations(final Field field) { + final String fieldName = field.toString(); + Annotation[] annotations = annotationCache.get(fieldName); + if ( annotations == null ) { + annotations = field.getAnnotations(); + annotationCache.put(fieldName, annotations); + } + return annotations; + } /** * diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java index 09f2c0168..b5c6a1ecf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java @@ -36,6 +36,16 @@ public class NewEvaluationContext extends HashMap { return new TreeMap(evaluationInstances); } + public StateKey makeStateKey() { + Map map = new HashMap(size()); + + for (Map.Entry elt : this.entrySet() ) { + map.put(elt.getKey().getName(), elt.getValue()); + } + + return new StateKey(map); + } + public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) { for ( final VariantEvaluator evaluation : evaluationInstances.values() ) { // the other updateN methods don't see a null context diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 96bd9a9b7..36b09300b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -3,25 +3,67 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; import java.util.Map; import java.util.TreeMap; -public class StateKey extends TreeMap { -// public int hashCode() { -// int hashCode = 1; -// -// for (final Map.Entry pair : this.entrySet()) { -// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); -// } -// -// return hashCode; -// } +/** + * A final constant class representing the specific state configuration + * for a VariantEvaluator instance. 
+ * + * TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function. + */ +public final class StateKey { + /** High-performance cache of the toString operation for a constant class */ + private final String string; + private final TreeMap states; - public String toString() { - String value = ""; + public StateKey(final Map states) { + this.states = new TreeMap(states); + this.string = formatString(); + } - for ( final String key : this.keySet() ) { - //value += "\tstate " + key + ":" + this.get(key) + "\n"; - value += String.format("%s:%s;", key, this.get(key)); + public StateKey(final StateKey toOverride, final String keyOverride, final String valueOverride) { + if ( toOverride == null ) { + this.states = new TreeMap(); + } else { + this.states = new TreeMap(toOverride.states); } - return value; + this.states.put(keyOverride, valueOverride); + this.string = formatString(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final StateKey stateKey = (StateKey) o; + + if (states != null ? 
!states.equals(stateKey.states) : stateKey.states != null) return false; + + return true; + } + + @Override + public int hashCode() { + return states.hashCode(); + } + + @Override + public String toString() { + return string; + } + + private final String formatString() { + StringBuilder b = new StringBuilder(); + + for ( Map.Entry entry : states.entrySet() ) { + b.append(String.format("%s:%s;", entry.getKey(), entry.getValue())); + } + + return b.toString(); + } + + // TODO -- might be slow because of tree map + public String get(final String key) { + return states.get(key); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 91c7140e6..9b4ae129a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -214,20 +214,9 @@ public class VariantEvalUtils { ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec)); } } else { - HashMap necs = new HashMap(); - - StateKey stateKey = new StateKey(); - for (VariantStratifier vs : ec.keySet()) { - String state = ec.get(vs); - - stateKey.put(vs.getName(), state); - } - + final StateKey stateKey = ec.makeStateKey(); ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); - - necs.put(stateKey, ec); - - return necs; + return new HashMap(Collections.singletonMap(stateKey, ec)); } return ecs; @@ -428,14 +417,8 @@ public class VariantEvalUtils { HashMap> oneSetOfStates = newStateStack.pop(); VariantStratifier vs = oneSetOfStates.keySet().iterator().next(); - for (String state : oneSetOfStates.get(vs)) { - StateKey newStateKey = new StateKey(); - if (stateKey != null) { - newStateKey.putAll(stateKey); - } - - newStateKey.put(vs.getName(), state); - + for (final 
String state : oneSetOfStates.get(vs)) { + final StateKey newStateKey = new StateKey(stateKey, vs.getName(), state); initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys); } } else { From 11b6fd990a6e9985de6c4a4dfbfc01af047c18bb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Mar 2012 10:39:45 -0400 Subject: [PATCH 3/4] GATKReportColumn optimizations -- Was TreeMap even though the sorting wasn't used. Replaced with LinkedHashMap. --- .../broadinstitute/sting/gatk/report/GATKReportColumn.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 9a7c4ced0..2b611109f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.report; import org.apache.commons.lang.math.NumberUtils; -import java.util.Arrays; -import java.util.Collection; -import java.util.TreeMap; +import java.util.*; /** * Holds values for a column in a GATK report table */ -public class GATKReportColumn extends TreeMap { +public class GATKReportColumn extends LinkedHashMap { final private String columnName; final private Object defaultValue; final private String format; From 34ea443cdbb071c435e22597171f6e6d663f63fa Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Mar 2012 16:28:02 -0400 Subject: [PATCH 4/4] Better algorithm for choosing which indel alleles are present in samples -- The previous approach (requiring > 5 copies among all reads) is breaking down in many samples (>1000) just from sequencing errors. -- This breakdown is producing spurious clustered indels (lots of these!) 
around real common indels -- The new approach requires >X% of reads in a sample to carry an indel of any type (no allele matching) to be including in the counting towards 5. This actually makes sense in that if you have enough data we expect most reads to have the indel, but the allele might be wrong because of alignment, etc. If you have very few reads, then the threshold is crossed with any indel containing read, and it's counted. -- As far as I can tell this is the right thing to do in general. We'll make another call set in ESP and see how it works at scale. -- Added integration tests to ensure that the system is behaving as I expect on the site I developed the code on from ESP --- .../genotyper/ConsensusAlleleCounter.java | 286 ++++++++++++++++++ ...elGenotypeLikelihoodsCalculationModel.java | 202 +------------ .../genotyper/UnifiedArgumentCollection.java | 12 + .../UnifiedGenotyperIntegrationTest.java | 39 ++- 4 files changed, 341 insertions(+), 198 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java new file mode 100644 index 000000000..3f03c2bb2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The 
above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.*; + +/** + * Code for determining which indels are segregating among the samples. + * + * This code is just a refactor of the original code from Guillermo in the UG. 
+ * + * @author Mark DePristo + * @since 3/26/12 + */ +public class ConsensusAlleleCounter { + final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); + private final int minIndelCountForGenotyping; + private final boolean doMultiAllelicCalls; + private final double minFractionInOneSample; + private final GenomeLocParser locParser; + + public ConsensusAlleleCounter(final GenomeLocParser locParser, + final boolean doMultiAllelicCalls, + final int minIndelCountForGenotyping, + final double minFractionInOneSample) { + this.minIndelCountForGenotyping = minIndelCountForGenotyping; + this.doMultiAllelicCalls = doMultiAllelicCalls; + this.minFractionInOneSample = minFractionInOneSample; + this.locParser = locParser; + } + + /** + * Returns a list of Alleles at this locus that may be segregating + * + * @param ref + * @param contexts + * @param contextType + * @return + */ + public List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); +// logger.info("Alleles at " + ref.getLocus()); +// for ( Map.Entry elt : consensusIndelStrings.entrySet() ) { +// logger.info(" " + elt.getValue() + " => " + elt.getKey()); +// } + return consensusCountsToAlleles(ref, consensusIndelStrings); + } + + // + // TODO -- WARNING DOESN'T WORK WITH REDUCED READS + // + private Map countConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final GenomeLoc loc = ref.getLocus(); + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for (Map.Entry sample : contexts.entrySet()) { + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + insCount += 
indelPileup.getNumberOfInsertions(); + delCount += indelPileup.getNumberOfDeletions(); + } + + if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) + return Collections.emptyMap(); + + for (Map.Entry sample : contexts.entrySet()) { + // todo -- warning, can be duplicating expensive partition here + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + + final int nIndelReads = indelPileup.getNumberOfInsertions() + indelPileup.getNumberOfDeletions(); + final int nReadsOverall = indelPileup.getNumberOfElements(); + if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample) { +// if ( nIndelReads > 0 ) +// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + continue; +// } else { +// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + } + + for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read == null) + continue; + if (ReadUtils.is454Read(read)) { + continue; + } + +/* if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + */ + String indelString = p.getEventBases(); + if (p.isInsertion()) { + boolean foundKey = false; + // copy of hashmap into temp arrayList + ArrayList> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an 
insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map + if (s.startsWith(indelString)) { + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + if (s.endsWith(indelString)) { + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? 
consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + } + + return consensusIndelStrings; + } + + private List consensusCountsToAlleles(final ReferenceContext ref, + final Map consensusIndelStrings) { + final GenomeLoc loc = ref.getLocus(); + final Collection vcs = new ArrayList(); + int maxAlleleCnt = 0; + Allele refAllele, altAllele; + + for (final Map.Entry elt : consensusIndelStrings.entrySet()) { + final String s = elt.getKey(); + final int curCnt = elt.getValue(); + int stop = 0; + + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + final int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else continue; // don't go on with this allele if refBases are non-standard + } else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } + else continue; // go on to next allele if consensus insertion has any non-standard base. 
+ } + + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(Arrays.asList(refAllele, altAllele)); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } + + } + } + + if (vcs.isEmpty()) + return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion + + final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + return mergedVC.getAlleles(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 21f11d2ff..00d90e3f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -52,11 +52,9 @@ import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final int HAPLOTYPE_SIZE; - private final int minIndelCountForGenotyping; private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; @@ -72,7 +70,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup private GenomeLoc lastSiteVisited; - private ArrayList 
alleleList; + private List alleleList = new ArrayList(); static { indelLikelihoodMap.set(new HashMap>()); @@ -83,205 +81,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - - private ArrayList computeConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { - Allele refAllele = null, altAllele = null; - GenomeLoc loc = ref.getLocus(); - ArrayList aList = new ArrayList(); - - HashMap consensusIndelStrings = new HashMap(); - - int insCount = 0, delCount = 0; - // quick check of total number of indels in pileup - for (Map.Entry sample : contexts.entrySet()) { - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - insCount += indelPileup.getNumberOfInsertions(); - delCount += indelPileup.getNumberOfDeletions(); - } - - if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) - return aList; - - for (Map.Entry sample : contexts.entrySet()) { - // todo -- warning, can be duplicating expensive partition here - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - - - for (ExtendedEventPileupElement p : 
indelPileup.toExtendedIterable()) { - //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) - continue; - if (ReadUtils.is454Read(read)) { - continue; - } - -/* if (DEBUG && p.isIndel()) { - System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", - read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), - p.getEventLength(),p.getType().toString(), p.getEventBases()); - } - */ - - String indelString = p.getEventBases(); - if (p.isInsertion()) { - boolean foundKey = false; - // copy of hashmap into temp arrayList - ArrayList> cList = new ArrayList>(); - for (String s : consensusIndelStrings.keySet()) { - cList.add(new Pair(s,consensusIndelStrings.get(s))); - } - - if (read.getAlignmentEnd() == loc.getStart()) { - // first corner condition: a read has an insertion at the end, and we're right at the insertion. - // In this case, the read could have any of the inserted bases and we need to build a consensus - - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - // case 1: current insertion is prefix of indel in hash map - if (s.startsWith(indelString)) { - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.startsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. 
- foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - } - else if (read.getAlignmentStart() == loc.getStart()+1) { - // opposite corner condition: read will start at current locus with an insertion - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - if (s.endsWith(indelString)) { - // case 1: current insertion (indelString) is suffix of indel in hash map (s) - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - - } - else { - // normal case: insertion somewhere in the middle of a read: add count to arrayList - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - cList.add(new Pair(indelString,cnt+1)); - } - - // copy back arrayList into hashMap - consensusIndelStrings.clear(); - for (Pair pair : cList) { - consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); - } - - } - else if (p.isDeletion()) { - indelString = String.format("D%d",p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString)? 
consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); - - } - } - - } - - Collection vcs = new ArrayList(); - int maxAlleleCnt = 0; - String bestAltAllele = ""; - - for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s), stop = 0; - // if observed count if above minimum threshold, we will genotype this allele - if (curCnt < minIndelCountForGenotyping) - continue; - - if (s.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(s.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); - stop = loc.getStart() + dLen; - byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); - - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases, true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); - } - else continue; // don't go on with this allele if refBases are non-standard - } else { - // insertion case - if (Allele.acceptableAlleleBases(s)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(s, false); - stop = loc.getStart(); - } - else continue; // go on to next allele if consensus insertion has any non-standard base. 
- } - - - ArrayList vcAlleles = new ArrayList(); - vcAlleles.add(refAllele); - vcAlleles.add(altAllele); - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(vcAlleles); - builder.referenceBaseForIndel(ref.getBase()); - builder.noGenotypes(); - if (doMultiAllelicCalls) - vcs.add(builder.make()); - else { - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - - } - } - - if (vcs.isEmpty()) - return aList; // nothing else to do, no alleles passed minimum count criterion - - VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); - - aList = new ArrayList(mergedVC.getAlleles()); - - return aList; - + private List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenomeLocParser locParser) { + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + return counter.computeConsensusAlleles(ref, contexts, contextType); } private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 82e411c25..7b8e5c897 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -118,6 +118,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_count_for_genotyping", shortName = 
"minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute + * to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end + * up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for + * low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily over + * come this threshold. + */ + @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) + public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; + + /** * This argument informs the prior probability of having an indel at a site. 
*/ @@ -165,6 +176,7 @@ public class UnifiedArgumentCollection { uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; + uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b3bd0253c..67cd40eea 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -17,8 +17,8 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- @@ -62,7 +62,7 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); + Arrays.asList("849ee8b21b4bbb02dfc7867a4f1bc14b")); executeTest("test Multiple SNP alleles", spec); } @@ -335,4 +335,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { UserException.class); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing minIndelFraction + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("f08ff07ad49d388198c1887baad05977")); + executeTest("test minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("a0945fd21369aaf68c7f1d96dbb930d1")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, 
Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a")); + executeTest("test minIndelFraction 1.0", spec); + } }