From 6be5e8286037e1a20b7b85508411db43d16a36c0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Mar 2012 07:42:15 -0400 Subject: [PATCH 01/14] VariantEval scalability optimizations -- StateKey no longer extends TreeMap. It's now a final immutable data structure that caches it's toString and hashcode values. TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function. -- NewEvaluationContext has a method makeStateKey() that contains all of the functionality that once was spread around VEUtils -- AnalysisModuleScanner uses an annotationCache to speed up the reflections getAnnotations() call when invoked over and over on the same objects. Still expensive to convert each field to a string for the cache, but the only way around that is a complete refactoring of the toTransversalDone of VE -- VariantEvaluator base class has a cached getSimpleName() function -- VEUtils: general cleanup due to refactoring of StateKey -- VEWalker: much better iteration of map data structures. If you need access to iterate over all key/value pairs use the Map.Entry construct with entrySet. This is far better than iterating over the keys and calling get() on each key. --- .../varianteval/VariantEvalWalker.java | 13 ++-- .../evaluators/VariantEvaluator.java | 9 +++ .../util/AnalysisModuleScanner.java | 14 +++- .../util/NewEvaluationContext.java | 10 +++ .../walkers/varianteval/util/StateKey.java | 74 +++++++++++++++---- .../varianteval/util/VariantEvalUtils.java | 25 +------ 6 files changed, 101 insertions(+), 44 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 2b9f159ac..4bfd90eac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -482,11 +482,13 @@ public class VariantEvalWalker extends RodWalker implements Tr public void onTraversalDone(Integer result) { logger.info("Finalizing variant report"); - for ( StateKey stateKey : evaluationContexts.keySet() ) { - NewEvaluationContext nec = evaluationContexts.get(stateKey); + for ( Map.Entry ecElt : evaluationContexts.entrySet() ) { + final StateKey stateKey = ecElt.getKey(); + final NewEvaluationContext nec = ecElt.getValue(); for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) { ve.finalizeEvaluation(); + final String veName = ve.getSimpleName(); // ve.getClass().getSimpleName(); AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); Map datamap = scanner.getData(); @@ -498,7 +500,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if (field.get(ve) instanceof TableType) { TableType t = (TableType) field.get(ve); - final String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); + final String subTableName = veName + "." + field.getName(); final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; @@ -539,11 +541,10 @@ public class VariantEvalWalker extends RodWalker implements Tr } } } else { - GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); + GATKReportTable table = report.getTable(veName); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getName(); - + final String columnName = vs.getName(); table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index d5cf685de..226429439 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -8,6 +8,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; public abstract class VariantEvaluator { private VariantEvalWalker walker; + private final String simpleName; + + protected VariantEvaluator() { + this.simpleName = getClass().getSimpleName(); + } public void initialize(VariantEvalWalker walker) { this.walker = walker; @@ -90,4 +95,8 @@ public abstract class VariantEvaluator { protected static String formattedRatio(final int num, final int denom) { return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom)); } + + public String getSimpleName() { + return simpleName; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java index db44e9e28..793bafdd0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java @@ -27,6 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.lang.annotation.Annotation; import java.lang.reflect.Field; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -40,6 +41,7 @@ import java.util.Map; * the object, a Mashalling object can serialize or deserialize a analysis module. */ public class AnalysisModuleScanner { + final private static Map annotationCache = new HashMap(); // what we extracted from the class private Map datums = new LinkedHashMap(); // the data we've discovered @@ -84,12 +86,22 @@ public class AnalysisModuleScanner { // get the fields from the class, and extract for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) { for (Field f : superCls.getDeclaredFields()) - for (Annotation annotation : f.getAnnotations()) { + for (Annotation annotation : getAnnotations(f)) { if (annotation.annotationType().equals(DataPoint.class)) datums.put(f,(DataPoint) annotation); } } } + + private Annotation[] getAnnotations(final Field field) { + final String fieldName = field.toString(); + Annotation[] annotations = annotationCache.get(fieldName); + if ( annotations == null ) { + annotations = field.getAnnotations(); + annotationCache.put(fieldName, annotations); + } + return annotations; + } /** * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java index 09f2c0168..b5c6a1ecf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java @@ -36,6 +36,16 @@ public class NewEvaluationContext extends HashMap { return new TreeMap(evaluationInstances); } + public StateKey makeStateKey() { + Map map = new HashMap(size()); + + for (Map.Entry elt : this.entrySet() ) { + map.put(elt.getKey().getName(), elt.getValue()); + } + + return new StateKey(map); + } + public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) { for ( final VariantEvaluator evaluation : evaluationInstances.values() ) { // the other updateN methods don't see a null context diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 96bd9a9b7..36b09300b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -3,25 +3,67 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; import java.util.Map; import java.util.TreeMap; -public class StateKey extends TreeMap { -// public int hashCode() { -// int hashCode = 1; -// -// for (final Map.Entry pair : this.entrySet()) { -// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); -// } -// -// return hashCode; -// } +/** + * A final constant class representing the specific state configuration + * for a VariantEvaluator instance. + * + * TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function. + */ +public final class StateKey { + /** High-performance cache of the toString operation for a constant class */ + private final String string; + private final TreeMap states; - public String toString() { - String value = ""; + public StateKey(final Map states) { + this.states = new TreeMap(states); + this.string = formatString(); + } - for ( final String key : this.keySet() ) { - //value += "\tstate " + key + ":" + this.get(key) + "\n"; - value += String.format("%s:%s;", key, this.get(key)); + public StateKey(final StateKey toOverride, final String keyOverride, final String valueOverride) { + if ( toOverride == null ) { + this.states = new TreeMap(); + } else { + this.states = new TreeMap(toOverride.states); } - return value; + this.states.put(keyOverride, valueOverride); + this.string = formatString(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final StateKey stateKey = (StateKey) o; + + if (states != null ? !states.equals(stateKey.states) : stateKey.states != null) return false; + + return true; + } + + @Override + public int hashCode() { + return states.hashCode(); + } + + @Override + public String toString() { + return string; + } + + private final String formatString() { + StringBuilder b = new StringBuilder(); + + for ( Map.Entry entry : states.entrySet() ) { + b.append(String.format("%s:%s;", entry.getKey(), entry.getValue())); + } + + return b.toString(); + } + + // TODO -- might be slow because of tree map + public String get(final String key) { + return states.get(key); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 91c7140e6..9b4ae129a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -214,20 +214,9 @@ public class VariantEvalUtils { ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec)); } } else { - HashMap necs = new HashMap(); - - StateKey stateKey = new StateKey(); - for (VariantStratifier vs : ec.keySet()) { - String state = ec.get(vs); - - stateKey.put(vs.getName(), state); - } - + final StateKey stateKey = ec.makeStateKey(); ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); - - necs.put(stateKey, ec); - - return necs; + return new HashMap(Collections.singletonMap(stateKey, ec)); } return ecs; @@ -428,14 +417,8 @@ public class VariantEvalUtils { HashMap> oneSetOfStates = newStateStack.pop(); VariantStratifier vs = oneSetOfStates.keySet().iterator().next(); - for (String state : oneSetOfStates.get(vs)) { - StateKey newStateKey = new StateKey(); - if (stateKey != null) { - newStateKey.putAll(stateKey); - } - - newStateKey.put(vs.getName(), state); - + for (final String state : oneSetOfStates.get(vs)) { + final StateKey newStateKey = new StateKey(stateKey, vs.getName(), state); initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys); } } else { From 11b6fd990a6e9985de6c4a4dfbfc01af047c18bb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Mar 2012 10:39:45 -0400 Subject: [PATCH 03/14] GATKReportColumn optimizations -- Was TreeMap even though the sorting wasn't used. Replaced with LinkedHashMap. --- .../broadinstitute/sting/gatk/report/GATKReportColumn.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 9a7c4ced0..2b611109f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.report; import org.apache.commons.lang.math.NumberUtils; -import java.util.Arrays; -import java.util.Collection; -import java.util.TreeMap; +import java.util.*; /** * Holds values for a column in a GATK report table */ -public class GATKReportColumn extends TreeMap { +public class GATKReportColumn extends LinkedHashMap { final private String columnName; final private Object defaultValue; final private String format; From 34ea443cdbb071c435e22597171f6e6d663f63fa Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Mar 2012 16:28:02 -0400 Subject: [PATCH 04/14] Better algorithm for choosing which indel alleles are present in samples -- The previous approach (requiring > 5 copies among all reads) is breaking down in many samples (>1000) just from sequencing errors. -- This breakdown is producing spurious clustered indels (lots of these!) around real common indels -- The new approach requires >X% of reads in a sample to carry an indel of any type (no allele matching) to be including in the counting towards 5. This actually makes sense in that if you have enough data we expect most reads to have the indel, but the allele might be wrong because of alignment, etc. If you have very few reads, then the threshold is crossed with any indel containing read, and it's counted. -- As far as I can tell this is the right thing to do in general. We'll make another call set in ESP and see how it works at scale. -- Added integration tests to ensure that the system is behaving as I expect on the site I developed the code on from ESP --- .../genotyper/ConsensusAlleleCounter.java | 286 ++++++++++++++++++ ...elGenotypeLikelihoodsCalculationModel.java | 202 +------------ .../genotyper/UnifiedArgumentCollection.java | 12 + .../UnifiedGenotyperIntegrationTest.java | 39 ++- 4 files changed, 341 insertions(+), 198 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java new file mode 100644 index 000000000..3f03c2bb2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.*; + +/** + * Code for determining which indels are segregating among the samples. + * + * This code is just a refactor of the original code from Guillermo in the UG. + * + * @author Mark DePristo + * @since 3/26/12 + */ +public class ConsensusAlleleCounter { + final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); + private final int minIndelCountForGenotyping; + private final boolean doMultiAllelicCalls; + private final double minFractionInOneSample; + private final GenomeLocParser locParser; + + public ConsensusAlleleCounter(final GenomeLocParser locParser, + final boolean doMultiAllelicCalls, + final int minIndelCountForGenotyping, + final double minFractionInOneSample) { + this.minIndelCountForGenotyping = minIndelCountForGenotyping; + this.doMultiAllelicCalls = doMultiAllelicCalls; + this.minFractionInOneSample = minFractionInOneSample; + this.locParser = locParser; + } + + /** + * Returns a list of Alleles at this locus that may be segregating + * + * @param ref + * @param contexts + * @param contextType + * @return + */ + public List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); +// logger.info("Alleles at " + ref.getLocus()); +// for ( Map.Entry elt : consensusIndelStrings.entrySet() ) { +// logger.info(" " + elt.getValue() + " => " + elt.getKey()); +// } + return consensusCountsToAlleles(ref, consensusIndelStrings); + } + + // + // TODO -- WARNING DOESN'T WORK WITH REDUCED READS + // + private Map countConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final GenomeLoc loc = ref.getLocus(); + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for (Map.Entry sample : contexts.entrySet()) { + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + insCount += indelPileup.getNumberOfInsertions(); + delCount += indelPileup.getNumberOfDeletions(); + } + + if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) + return Collections.emptyMap(); + + for (Map.Entry sample : contexts.entrySet()) { + // todo -- warning, can be duplicating expensive partition here + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + + final int nIndelReads = indelPileup.getNumberOfInsertions() + indelPileup.getNumberOfDeletions(); + final int nReadsOverall = indelPileup.getNumberOfElements(); + if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample) { +// if ( nIndelReads > 0 ) +// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + continue; +// } else { +// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + } + + for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read == null) + continue; + if (ReadUtils.is454Read(read)) { + continue; + } + +/* if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + */ + String indelString = p.getEventBases(); + if (p.isInsertion()) { + boolean foundKey = false; + // copy of hashmap into temp arrayList + ArrayList> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map + if (s.startsWith(indelString)) { + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + if (s.endsWith(indelString)) { + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + } + + return consensusIndelStrings; + } + + private List consensusCountsToAlleles(final ReferenceContext ref, + final Map consensusIndelStrings) { + final GenomeLoc loc = ref.getLocus(); + final Collection vcs = new ArrayList(); + int maxAlleleCnt = 0; + Allele refAllele, altAllele; + + for (final Map.Entry elt : consensusIndelStrings.entrySet()) { + final String s = elt.getKey(); + final int curCnt = elt.getValue(); + int stop = 0; + + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + final int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else continue; // don't go on with this allele if refBases are non-standard + } else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } + else continue; // go on to next allele if consensus insertion has any non-standard base. + } + + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(Arrays.asList(refAllele, altAllele)); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } + + } + } + + if (vcs.isEmpty()) + return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion + + final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + return mergedVC.getAlleles(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 21f11d2ff..00d90e3f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -52,11 +52,9 @@ import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final int HAPLOTYPE_SIZE; - private final int minIndelCountForGenotyping; private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; @@ -72,7 +70,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup private GenomeLoc lastSiteVisited; - private ArrayList alleleList; + private List alleleList = new ArrayList(); static { indelLikelihoodMap.set(new HashMap>()); @@ -83,205 +81,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - - private ArrayList computeConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { - Allele refAllele = null, altAllele = null; - GenomeLoc loc = ref.getLocus(); - ArrayList aList = new ArrayList(); - - HashMap consensusIndelStrings = new HashMap(); - - int insCount = 0, delCount = 0; - // quick check of total number of indels in pileup - for (Map.Entry sample : contexts.entrySet()) { - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - insCount += indelPileup.getNumberOfInsertions(); - delCount += indelPileup.getNumberOfDeletions(); - } - - if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) - return aList; - - for (Map.Entry sample : contexts.entrySet()) { - // todo -- warning, can be duplicating expensive partition here - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - - - for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { - //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) - continue; - if (ReadUtils.is454Read(read)) { - continue; - } - -/* if (DEBUG && p.isIndel()) { - System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", - read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), - p.getEventLength(),p.getType().toString(), p.getEventBases()); - } - */ - - String indelString = p.getEventBases(); - if (p.isInsertion()) { - boolean foundKey = false; - // copy of hashmap into temp arrayList - ArrayList> cList = new ArrayList>(); - for (String s : consensusIndelStrings.keySet()) { - cList.add(new Pair(s,consensusIndelStrings.get(s))); - } - - if (read.getAlignmentEnd() == loc.getStart()) { - // first corner condition: a read has an insertion at the end, and we're right at the insertion. - // In this case, the read could have any of the inserted bases and we need to build a consensus - - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - // case 1: current insertion is prefix of indel in hash map - if (s.startsWith(indelString)) { - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.startsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - } - else if (read.getAlignmentStart() == loc.getStart()+1) { - // opposite corner condition: read will start at current locus with an insertion - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - if (s.endsWith(indelString)) { - // case 1: current insertion (indelString) is suffix of indel in hash map (s) - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - - } - else { - // normal case: insertion somewhere in the middle of a read: add count to arrayList - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - cList.add(new Pair(indelString,cnt+1)); - } - - // copy back arrayList into hashMap - consensusIndelStrings.clear(); - for (Pair pair : cList) { - consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); - } - - } - else if (p.isDeletion()) { - indelString = String.format("D%d",p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); - - } - } - - } - - Collection vcs = new ArrayList(); - int maxAlleleCnt = 0; - String bestAltAllele = ""; - - for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s), stop = 0; - // if observed count if above minimum threshold, we will genotype this allele - if (curCnt < minIndelCountForGenotyping) - continue; - - if (s.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(s.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); - stop = loc.getStart() + dLen; - byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); - - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases, true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); - } - else continue; // don't go on with this allele if refBases are non-standard - } else { - // insertion case - if (Allele.acceptableAlleleBases(s)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(s, false); - stop = loc.getStart(); - } - else continue; // go on to next allele if consensus insertion has any non-standard base. - } - - - ArrayList vcAlleles = new ArrayList(); - vcAlleles.add(refAllele); - vcAlleles.add(altAllele); - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(vcAlleles); - builder.referenceBaseForIndel(ref.getBase()); - builder.noGenotypes(); - if (doMultiAllelicCalls) - vcs.add(builder.make()); - else { - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - - } - } - - if (vcs.isEmpty()) - return aList; // nothing else to do, no alleles passed minimum count criterion - - VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); - - aList = new ArrayList(mergedVC.getAlleles()); - - return aList; - + private List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenomeLocParser locParser) { + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + return counter.computeConsensusAlleles(ref, contexts, contextType); } private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 82e411c25..7b8e5c897 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -118,6 +118,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute + * to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end + * up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for + * low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily over + * come this threshold. + */ + @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) + public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; + + /** * This argument informs the prior probability of having an indel at a site. */ @@ -165,6 +176,7 @@ public class UnifiedArgumentCollection { uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; + uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b3bd0253c..67cd40eea 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -17,8 +17,8 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); + Arrays.asList("849ee8b21b4bbb02dfc7867a4f1bc14b")); executeTest("test Multiple SNP alleles", spec); } @@ -335,4 +335,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { UserException.class); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing SnpEff + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("f08ff07ad49d388198c1887baad05977")); + executeTest("test minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("a0945fd21369aaf68c7f1d96dbb930d1")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a")); + executeTest("test minIndelFraction 1.0", spec); + } } From 1a2a4848e814ac9bd5ab29455ef560bd5d402b2e Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 26 Mar 2012 16:39:55 -0400 Subject: [PATCH 05/14] Added integration test for ValidationSiteSelector, correct MD5's --- ...ValidationSiteSelectorIntegrationTest.java | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java new file mode 100644 index 000000000..d7c866a0a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java @@ -0,0 +1,90 @@ +package org.broadinstitute.sting.gatk.walkers.validation; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 3/26/12 + * Time: 3:29 PM + * To change this template use File | Settings | File Templates. + */ +public class ValidationSiteSelectorIntegrationTest extends WalkerTest { + public static String baseTestString(String args) { + return "-T ValidationSiteSelector -R " + b36KGReference + " -L 1 -o %s -NO_HEADER -numSites 100 " + args; + } + + private static String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + private static String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + private static String samplePrefix = " -sf " + samplesFile; + private static String freqUnif = " --frequencySelectionMode UNIFORM "; + private static String freqAF = " --frequencySelectionMode KEEP_AF_SPECTRUM "; + private static String sampleNone = " -sampleMode NONE "; + private static String sampleGT = samplePrefix+" -sampleMode POLY_BASED_ON_GT "; + private static String sampleGL = samplePrefix+" -sampleMode POLY_BASED_ON_GL -samplePNonref 0.95"; + + + @Test(enabled=true) + public void testNoSampleSelectionFreqUniform() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleNone + freqUnif + "--variant " + testfile), + 1, + Arrays.asList("d49baeb8000a426c172ce1d81eb37963") + ); + + executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec); + } + + @Test(enabled=true) + public void testNoSampleSelectionFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleNone + freqAF + "--variant " + testfile), + 1, + Arrays.asList("0fb0d015d462c34514fc7e96beea5f56") + ); + + executeTest("testNoSampleSelectionFreqAF--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGTFreqUniform() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGT + freqUnif + "--variant " + testfile), + 1, + Arrays.asList("0672854299d42ea8af906976a3849ae6") + ); + + executeTest("testPolyGTFreqUniform--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGTFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGT + freqAF + "--variant " + testfile), + 1, + Arrays.asList("5bdffda1a063d0bddd6b236854ec627d") + ); + + executeTest("testPolyGTFreqAF--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGLFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGL + freqAF + "--variant " + testfile), + 1, + Arrays.asList("35ef16aa41303606a4b94f7b88bd9aa8") + ); + + executeTest("testPolyGLFreqAF--" + testfile, spec); + } + +} From c07a577ba36338309ed94c5fb13fcd8890b63947 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Mar 2012 00:27:44 -0500 Subject: [PATCH 06/14] Significant restructuring of the Exact model, as discussed within the dev group last week. There is no more marginalizing over alternate alleles, and we now keep track of the MLE and MAP. Important notes: 1) integration tests change because the previous marginalization wasn't done correctly (as pointed out by Guillermo) and our confidences were too high for many multi-allelic sites; 2) there is a major TO-DO item that needs to be discussed within the dev group (so they should expect a follow up email); 3) this code is still in flux as I am awaiting feedback from Ryan now on its performance with the Haplotype Caller (the good news, Ryan, is that we recover that site that we were losing previously). --- .../AlleleFrequencyCalculationModel.java | 2 +- .../AlleleFrequencyCalculationResult.java | 109 +++++++-- .../genotyper/ExactAFCalculationModel.java | 78 +++---- .../gatk/walkers/genotyper/UGBoundAF.java | 209 ------------------ .../walkers/genotyper/UnifiedGenotyper.java | 3 +- .../genotyper/UnifiedGenotyperEngine.java | 103 ++++----- .../GLBasedSampleSelector.java | 24 +- .../ExactAFCalculationModelUnitTest.java | 21 +- .../UnifiedGenotyperIntegrationTest.java | 14 +- 9 files changed, 182 insertions(+), 381 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 9f2403bbf..e1ce2ee18 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -69,6 +69,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @return the alleles used for genotyping */ protected abstract List getLog10PNonRef(final VariantContext vc, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index 9c4af8512..0867d949e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -25,6 +25,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.ArrayList; +import java.util.Arrays; + /** * Created by IntelliJ IDEA. * User: ebanks @@ -34,23 +39,54 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; */ public class AlleleFrequencyCalculationResult { - // IMPORTANT NOTE: - // These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N). - // For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that - // the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may - // be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0). - // In the bi-allelic case (where there are no other alternate alleles over which to marginalize), - // the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED. - final double[][] log10AlleleFrequencyLikelihoods; - final double[][] log10AlleleFrequencyPosteriors; + // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles + private double log10MLE; + private double log10MAP; + final private int[] alleleCountsOfMLE; + final private int[] alleleCountsOfMAP; - // These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - double log10LikelihoodOfAFzero = 0.0; - double log10PosteriorOfAFzero = 0.0; + // The posteriors seen, not including that of AF=0 + // TODO -- better implementation needed here (see below) + private ArrayList log10PosteriorMatrixValues = new ArrayList(100000); + private Double log10PosteriorMatrixSum = null; - public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { - log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; - log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; + // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + private double log10LikelihoodOfAFzero; + private double log10PosteriorOfAFzero; + + + public AlleleFrequencyCalculationResult(final int maxAltAlleles) { + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + reset(); + } + + public double getLog10MLE() { + return log10MLE; + } + + public double getLog10MAP() { + return log10MAP; + } + + public double getLog10PosteriorMatrixSum() { + if ( log10PosteriorMatrixSum == null ) { + // TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory; + // TODO -- will discuss with the team what the best option is + final double[] tmp = new double[log10PosteriorMatrixValues.size()]; + for ( int i = 0; i < tmp.length; i++ ) + tmp[i] = log10PosteriorMatrixValues.get(i); + log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp); + } + return log10PosteriorMatrixSum; + } + + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + public int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; } public double getLog10LikelihoodOfAFzero() { @@ -60,4 +96,47 @@ public class AlleleFrequencyCalculationResult { public double getLog10PosteriorOfAFzero() { return log10PosteriorOfAFzero; } + + public void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { + alleleCountsOfMLE[i] = 0; + alleleCountsOfMAP[i] = 0; + } + log10PosteriorMatrixValues.clear(); + log10PosteriorMatrixSum = null; + } + + public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMLE[i] = alleleCountsForK[i]; + } + } + + public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { + log10PosteriorMatrixValues.add(log10LofK); + if ( log10LofK > log10MAP ) { + log10MAP = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMAP[i] = alleleCountsForK[i]; + } + } + + public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 6c7dc0dcd..891159512 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -43,7 +43,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } public List getLog10PNonRef(final VariantContext vc, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { GenotypesContext GLs = vc.getGenotypes(); @@ -59,7 +59,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false); } - //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); return alleles; @@ -207,20 +206,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - // TODO -- remove me public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numAlternateAlleles, - final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result, - final boolean foo) { - linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result); - } - - - - public static void linearExactMultiAllelic(final GenotypesContext GLs, - final int numAlternateAlleles, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { final ArrayList genotypeLikelihoods = getGLs(GLs); @@ -272,7 +260,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { //if ( DEBUG ) @@ -360,7 +348,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private static void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { set.log10Likelihoods[0] = 0.0; // the zero case @@ -370,47 +358,39 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( totalK == 0 ) { for ( int j = 1; j < set.log10Likelihoods.length; j++ ) set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; + result.setLog10LikelihoodOfAFzero(log10Lof0); + result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return; } - // k > 0 for at least one k - else { - // the non-AA possible conformations were dealt with by pushes from dependent sets; - // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); - } + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; } - final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; - // determine the power of theta to use - int nonRefAlleles = 0; - for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { - if ( set.ACcounts.getCounts()[i] > 0 ) - nonRefAlleles++; - } - - // for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object - if ( nonRefAlleles == 0 ) { - result.log10LikelihoodOfAFzero = log10LofK; - result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0]; - } else { - // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs - for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { - int AC = set.ACcounts.getCounts()[i]; - result.log10AlleleFrequencyLikelihoods[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); - - final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; - result.log10AlleleFrequencyPosteriors[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); - } + // update the MLE if necessary + result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); + + // apply the priors over each alternate allele + for ( final int ACcount : set.ACcounts.getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; } + result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); } private static void pushData(final ExactACset targetSet, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java deleted file mode 100755 index 99d55bc69..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java +++ /dev/null @@ -1,209 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.commons.lang.NotImplementedException; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.security.cert.CertificateNotYetValidException; -import java.util.*; - -import org.broadinstitute.sting.utils.codecs.vcf.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 8/30/11 - * Time: 10:08 AM - * To change this template use File | Settings | File Templates. - */ -public class UGBoundAF extends RodWalker { - - @Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true) - VCFWriter writer; - - @Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true) - List> variants; - - private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0); - - private HashMap> epsilonPosteriorCache = new HashMap>(8192); - private HashMap logAC0Cache = new HashMap(8192); - private int QUANTIZATION_FACTOR = 1000; - - - public void initialize() { - Set allHeaderLines = new HashSet(1024); - for ( RodBinding v : variants ) { - String trackName = v.getName(); - Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set headerLines = new HashSet(vcfHeaders.get(trackName).getMetaData()); - } - allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+ - "frequency. First value=95% probability AF>x. Second value=95% probability AF allVariants = tracker.getValues(variants); - if ( allVariants.size() == 0 ) { - return null; - } - - List alternateAlleles = getAllAlternateAlleles(allVariants); - VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet())); - if ( alternateAlleles.size() > 1 ) { - logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString()); - alternateAlleles.add(allVariants.get(0).getReference()); - builder.alleles(alternateAlleles); - builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles))); - } else { - // get all the genotype likelihoods - GenotypesContext context = GenotypesContext.create(); - int numNoCall = 0; - for ( VariantContext v : allVariants ) { - numNoCall += v.getNoCallCount(); - context.addAll(v.getGenotypes()); - } - builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context))); - } - - return builder.make(); - } - - private List getAllAlternateAlleles(List variants) { - List alleles = new ArrayList(3); // some overhead - for ( VariantContext v : variants ) { - alleles.addAll(v.getAlternateAlleles()); - } - return alleles; - } - - @Override - public Integer reduce(VariantContext value, Integer sum) { - if ( value == null ) - return sum; - writer.add(value); - return ++sum; - } - - private int N_ITERATIONS = 1; - private double[] getACPosteriors(GenotypesContext gc) { - // note this uses uniform priors (!) - - double[][] zeroPriors = new double[1][1+2*gc.size()]; - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size()); - // todo -- allow multiple alleles here - for ( int i = 0; i < N_ITERATIONS; i ++ ) { - ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false); - } - - return result.log10AlleleFrequencyPosteriors[0]; - } - - private String boundAlleleFrequency(double[] ACLikelihoods) { - // note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs - // thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be - // agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over - - // want to pick a lower limit x and upper limit y such that - // int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95 - // int_{f=x to y} calculateAFPosterior(f) df = 0.95 - // and that (y-x) is minimized - - // this is done by quantizing [0,1] into small bins and, since the distribution is - // unimodal, greedily adding them until the probability is >= 0.95 - - throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented")); - } - - private double calculateAFPosterior(double[] likelihoods, double af) { - double[] probLiks = new double[likelihoods.length]; - for ( int c = 0; c < likelihoods.length; c++) { - probLiks[c] = calculateAFPosterior(c,likelihoods.length,af); - } - - return MathUtils.log10sumLog10(probLiks); - } - - private double calculateAFPosterior(int ac, int n, double af) { - // evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes - switch ( ac ) { - case 0: - return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af); - case 1: - return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT); - case 2: - return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT); - default: - return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac); - } - } - - private double logAC0Coef(int an) { - if ( ! logAC0Cache.containsKey(an) ) { - double coef = -Math.log10(EPS_LOWER_LIMIT); - for ( int k = 1; k <= an; k++ ) { - // note this should typically just be - // term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k - // but the 1-E term will just be 1, so we do the following to mitigate this problem - double binom = MathUtils.binomialCoefficient(an,k); - double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k); - double term = binom/k - Math.pow(eps_correction,k); - if ( k % 2 == 0 ) { - coef += term; - } else { - coef -= term; - } - } - - logAC0Cache.put(an,coef); - } - - return logAC0Cache.get(an); - } - - private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) { - double mid = (start + stop)/2; - double size = stop-start; - double fa = calculateAFPosterior(likelihoods,start); - double fb = calculateAFPosterior(likelihoods,mid); - double fc = calculateAFPosterior(likelihoods,stop); - double s = (size/6)*(fa + 4*fc + fb); - double h = simpAux(likelihoods,start,stop,err,s,fa,fb,fc,cap); - return h; - } - - private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){ - if ( s == 0 ) - return -300.0; - double c = ( a + b )/2; - double h = b-a; - double d = (a + c)/2; - double e = (c + b)/2; - double fd = calculateAFPosterior(likelihoods, d); - double fe = calculateAFPosterior(likelihoods, e); - double s_l = (h/12)*(fa + 4*fd + fc); - double s_r = (h/12)*(fc + 4*fe + fb); - double s_2 = s_l + s_r; - if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){ - return Math.log10(s_2 + (s_2 - s)/15.0); - } - - return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 0eb35d299..a04aef77b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -137,6 +137,7 @@ public class UnifiedGenotyper extends LocusWalker posteriorsArray = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[][] log10AlleleFrequencyPriorsSNPs; - private final double[][] log10AlleleFrequencyPriorsIndels; + private final double[] log10AlleleFrequencyPriorsSNPs; + private final double[] log10AlleleFrequencyPriorsIndels; // the priors object private final GenotypePriors genotypePriorsSNPs; @@ -128,8 +128,8 @@ public class UnifiedGenotyperEngine { this.annotationEngine = engine; N = 2 * this.samples.size(); - log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; - log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + log10AlleleFrequencyPriorsSNPs = new double[N+1]; + log10AlleleFrequencyPriorsIndels = new double[N+1]; computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); @@ -265,8 +265,8 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); - posteriorsArray.set(new double[N + 2]); + alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); + posteriorsArray.set(new double[2]); } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); @@ -279,9 +279,7 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? @@ -296,12 +294,11 @@ public class UnifiedGenotyperEngine { // the genotyping model may have stripped it out if ( indexOfAllele == -1 ) continue; - - int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]); - // if the most likely AC is not 0, then this is a good alternate allele to use; - // make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array - if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { + int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1]; + + // if the most likely AC is not 0, then this is a good alternate allele to use + if ( indexOfBestAC != 0 ) { myAlleles.add(alternateAllele); bestGuessIsRef = false; } @@ -312,7 +309,6 @@ public class UnifiedGenotyperEngine { } // calculate p(f>0): - // because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); final double PofF = 1.0 - normalizedPosteriors[0]; @@ -320,18 +316,11 @@ public class UnifiedGenotyperEngine { if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero; + phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - double sum = AFresult.log10AlleleFrequencyPosteriors[0][0]; - if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - sum = 0.0; - for (int i = 1; i <= N; i++) { - if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - break; - sum += AFresult.log10AlleleFrequencyPosteriors[0][i]; - } + final double sum = AFresult.getLog10PosteriorMatrixSum(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } @@ -360,7 +349,7 @@ public class UnifiedGenotyperEngine { // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model); + printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -374,29 +363,27 @@ public class UnifiedGenotyperEngine { // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List alternateAllelesToUse = builder.make().getAlternateAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; - double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; - double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -433,9 +420,9 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero; - System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1); + public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { + normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); + normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum(); return MathUtils.normalizeFromLog10(normalizedPosteriors); } @@ -495,14 +482,6 @@ public class UnifiedGenotyperEngine { return stratifiedContexts; } - protected static void clearAFarray(double[][] AFs) { - for ( int i = 0; i < AFs.length; i++ ) { - for ( int j = 0; j < AFs[i].length; j++ ) { - AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } - } - private final static double[] binomialProbabilityDepthCache = new double[10000]; static { for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { @@ -547,7 +526,7 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); } - protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors, final GenotypeLikelihoodsCalculationModel.Model model) { + protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { Allele refAllele = null, altAllele = null; for ( Allele allele : vc.getAlleles() ) { if ( allele.isReference() ) @@ -570,11 +549,8 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) - AFline.append("0.00000000\t"); - else - AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i])); - AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE())); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP())); verboseWriter.println(AFline.toString()); } @@ -638,25 +614,22 @@ public class UnifiedGenotyperEngine { return null; } - protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { + protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { - // the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i - for (int alleles = 1; alleles <= priors.length; alleles++) { - double sum = 0.0; + double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - double value = Math.pow(theta, alleles) / (double)i; - priors[alleles-1][i] = Math.log10(value); - sum += value; - } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[alleles-1][0] = Math.log10(1.0 - sum); + // for each i + for (int i = 1; i <= N; i++) { + final double value = theta / (double)i; + priors[i] = Math.log10(value); + sum += value; } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); } - protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { switch( model ) { case SNP: return log10AlleleFrequencyPriorsSNPs; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index ff3fe6506..3e48520a7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -25,19 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.HashMap; -import java.util.List; -import java.util.Map; import java.util.TreeSet; public class GLBasedSampleSelector extends SampleSelector { - Map numAllelePriorMatrix = new HashMap(); + double[] flatPriors = null; double referenceLikelihood; public GLBasedSampleSelector(TreeSet sm, double refLik) { super(sm); @@ -53,9 +47,11 @@ public class GLBasedSampleSelector extends SampleSelector { // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? - double[][] flatPrior = createFlatPrior(vc.getAlleles()); - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); + if ( flatPriors == null ) { + flatPriors = new double[1+2*samples.size()]; + } + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); + ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); // do we want to let this qual go up or down? if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { return true; @@ -63,12 +59,4 @@ public class GLBasedSampleSelector extends SampleSelector { return false; } - - private double[][] createFlatPrior(List alleles) { - if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) { - numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]); - } - - return numAllelePriorMatrix.get(alleles.size()); - } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index c7d196b53..31c7a4e83 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -18,7 +17,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { static double[] AA1, AB1, BB1; static double[] AA2, AB2, AC2, BB2, BC2, CC2; static final int numSamples = 3; - static double[][] priors = new double[2][2*numSamples+1]; // flat priors + static double[] priors = new double[2*numSamples+1]; // flat priors @BeforeSuite public void before() { @@ -83,26 +82,16 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); - for ( int i = 0; i < 2; i++ ) { - for ( int j = 0; j < 2*numSamples+1; j++ ) { - result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) { - Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero); - } else { - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } + Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 67cd40eea..216406b63 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232")); + Arrays.asList("ac3737b4212f634a03c640c83f670955")); executeTest("test MultiSample Pilot1", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("849ee8b21b4bbb02dfc7867a4f1bc14b")); + Arrays.asList("6f70dfbaf3bb70c702f9e9dbacd67c17")); executeTest("test Multiple SNP alleles", spec); } @@ -138,7 +138,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("6172d2f3d370132f4c57a26aa94c256e")); + Arrays.asList("e9d23a08472e4e27b4f25e844f5bad57")); executeTest("test SLOD", spec); } @@ -146,8 +146,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); - e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "ecf92054c1e4bd9d6529b8002d385165" ); + e.put( "--output_mode EMIT_ALL_SITES", "119c9fcefbc69e0fc10b1dc52f6438e3" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -300,13 +300,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("52340d578a708fa709b69ce48987bc9d")); + Arrays.asList("fbc48d7d9e622c9af7922f91bc858151")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("9566c7abef5ee5829a516d90445b347f")); + Arrays.asList("94c52ef70e44709ccd947d32e9c27da9")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } From 913c8b231fddeebcda205e9c88489b3f1876a1ea Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 27 Mar 2012 10:34:56 -0400 Subject: [PATCH 07/14] Fix ErrorRatePerCycle to overload equals and hashcode -- Fixes failing integration tests --- .../diagnostics/ErrorRatePerCycle.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index e7a2f74e2..10ac523e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -91,6 +91,25 @@ public class ErrorRatePerCycle extends LocusWalker { this.cycle = cycle; } + // Must overload hashCode and equals to properly work with GATKReportColumn + @Override + public int hashCode() { + return readGroup.hashCode() + 33 * cycle; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final TableKey oKey = (TableKey) o; + + if ( cycle != oKey.cycle ) return false; + if ( !readGroup.equals(oKey.readGroup) ) return false; + + return true; + } + @Override public int compareTo(final TableKey tableKey) { final int scmp = readGroup.compareTo(tableKey.readGroup); From 1f5f737c8bcf640edcc8db2b6d9827ecfdcedf04 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 27 Mar 2012 11:54:35 -0400 Subject: [PATCH 09/14] Optimizing the GATKReportTable.write -- Better iteration, caching of strings, better printf calls, to improve the writing performance of GATKReportTables --- .../sting/gatk/report/GATKReportTable.java | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 62c36ca6c..e0e3ad1fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -756,10 +756,6 @@ public class GATKReportTable { */ // Get the column widths for everything - HashMap columnFormats = new HashMap(); - for (String columnName : columns.keySet()) { - columnFormats.put(columnName, columns.get(columnName).getColumnFormat()); - } String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; // Emit the table definition @@ -787,7 +783,7 @@ public class GATKReportTable { if (needsPadding) { out.printf(" "); } - out.printf(columnFormats.get(columnName).getNameFormat(), columnName); + out.printf(columns.get(columnName).getColumnFormat().getNameFormat(), columnName); needsPadding = true; } @@ -796,29 +792,31 @@ public class GATKReportTable { out.printf("%n"); // Emit the table body - for (Object primaryKey : primaryKeyColumn) { + for (final Object primaryKey : primaryKeyColumn) { needsPadding = false; if (primaryKeyDisplay) { out.printf(primaryKeyFormat, primaryKey); needsPadding = true; } - for (String columnName : columns.keySet()) { - if (columns.get(columnName).isDisplayable()) { + for (final Map.Entry entry : columns.entrySet()) { + final GATKReportColumn column = entry.getValue(); + if (column.isDisplayable()) { if (needsPadding) { - out.printf(" "); + out.print(" "); } - String value = columns.get(columnName).getStringValue(primaryKey); - out.printf(columnFormats.get(columnName).getValueFormat(), value); + + final String value = column.getStringValue(primaryKey); + out.printf(column.getColumnFormat().getValueFormat(), value); needsPadding = true; } } - out.printf("%n"); + out.println(); } - out.printf("%n"); + out.println(); } public int getNumRows() { From 679bb03014c91b6b29f877b1b3bdd0478fa6d418 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 27 Mar 2012 11:54:58 -0400 Subject: [PATCH 10/14] Simple utility function for converting an Iterable to Collection --- public/java/src/org/broadinstitute/sting/utils/Utils.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index a824fefab..130a7fa2f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -703,4 +703,11 @@ public class Utils { return programRecord; } + public static Collection makeCollection(Iterable iter) { + Collection list = new ArrayList(); + for (E item : iter) { + list.add(item); + } + return list; + } } From a638996fe24e547efc54d24577d184c6c1e059f0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 27 Mar 2012 11:56:24 -0400 Subject: [PATCH 11/14] Cleanup of VariantEval, diatribe about performance problems with StateKey -- Minor refactoring of state key iteration in VEW.map to make the dependencies more clear -- Long discussion about the performance problems with StateKey, and how to fix it, which I have run out of time to address before ESP meeting. --- .../varianteval/VariantEvalWalker.java | 80 ++++++++++++++++--- .../walkers/varianteval/util/StateKey.java | 37 ++++++++- 2 files changed, 104 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 4bfd90eac..ebd2500fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker implements Tr // find the comp final VariantContext comp = findMatchingComp(eval, compSet); - HashMap> stateMap = new HashMap>(); - for ( VariantStratifier vs : stratificationObjects ) { - List states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName); - stateMap.put(vs, states); - } - - ArrayList stateKeys = new ArrayList(); - variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); - - HashSet stateKeysHash = new HashSet(stateKeys); - - for ( StateKey stateKey : stateKeysHash ) { + for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) { NewEvaluationContext nec = evaluationContexts.get(stateKey); // eval against the comp @@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker implements Tr return null; } +// private Iterable getApplicableStates(final RefMetaDataTracker tracker, +// final ReferenceContext ref, +// final VariantContext eval, +// final String evalName, +// final VariantContext comp, +// final String compName, +// final String sampleName ) { +// Set oldKeys = new HashSet(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName))); +// +// int n = 0; +// for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) { +// n++; +// if ( ! oldKeys.contains(newKey) ) +// throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm"); +// } +// +// if ( n != oldKeys.size() ) +// throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size()); +// +// return oldKeys; +// } + +// private Iterable getApplicableStatesNew(final RefMetaDataTracker tracker, +// final ReferenceContext ref, +// final VariantContext eval, +// final String evalName, +// final VariantContext comp, +// final String compName, +// final String sampleName ) { +// // todo -- implement optimized version +// } + + /** + * Given specific eval and comp VCs and the sample name, return an iterable + * over all of the applicable state keys. + * + * See header of StateKey for performance problems... + * + * @param tracker + * @param ref + * @param eval + * @param evalName + * @param comp + * @param compName + * @param sampleName + * @return + */ + private Iterable getApplicableStates(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final VariantContext eval, + final String evalName, + final VariantContext comp, + final String compName, + final String sampleName ) { + final HashMap> stateMap = new HashMap>(stratificationObjects.size()); + for ( final VariantStratifier vs : stratificationObjects ) { + List states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName); + stateMap.put(vs, states); + } + + ArrayList stateKeys = new ArrayList(); + variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); + + return new HashSet(stateKeys); + } + + @Requires({"comp != null", "evals != null"}) private boolean compHasMatchingEval(final VariantContext comp, final Collection evals) { // find all of the matching comps diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 36b09300b..f62de17a5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -7,7 +7,42 @@ import java.util.TreeMap; * A final constant class representing the specific state configuration * for a VariantEvaluator instance. * - * TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function. + * The way this is currently implemented is by a map from the name of a VariantStratification to a + * specific state string. For example, the stratification Novelty has states all, known, novel. A + * specific variant and comp would be tagged as "known" by the stratification, and this could be + * represented here by the map (Novelty -> known). + * + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * + * I've been staring at this state key code for a while. It's just not right, and expensive to boot. + * Here are my thoughts for future work. The state key is both a key with specific state values for + * every stratification. For example, (known, sample1, ac=1). This capability is used in some places, + * such as below, to return a set of all states that should be updated given the eval and comp + * VCs. In principle there are a finite set of such combinations (the product of all states for all active + * stratifications at initialization). We could represent such keys as integers into the set of all combinations. + * + * Note that all of the code that manipulates these things is just terrible. It's all string manipulation and + * HashMaps. Since we are effectively always squaring off our VE analyses (i.e., we have a table with + * all variable values for all stratification combinations) it doesn't make sense to allow so much dynamicism. Instead + * we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from + * specific strat state to this key. + * + * The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in + * the initializeStateKey function. Instead, we should have code that looks like: + * + * init: + * allStates <- initializeCombinationalStateSpace + * + * map: + * for each eval / comp pair: + * for each relevantState based on eval / comp: + * allStates[relevantState].update(eval, comp) + * + * */ public final class StateKey { /** High-performance cache of the toString operation for a constant class */ From c112e0824ac68d8f4c72029db96bd5cc2dc40188 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Mar 2012 11:09:03 -0500 Subject: [PATCH 12/14] I was adding verbose output to the Pileup output for a one-off and decided that I might as well commit it as an option. Updated deprecated calls while I was in there. --- .../sting/gatk/walkers/PileupWalker.java | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java index 4d8be4800..0c2b3e349 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -68,6 +69,9 @@ public class PileupWalker extends LocusWalker implements TreeR @Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events") public boolean SHOW_INDEL_PILEUPS = false; + @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output") + public boolean SHOW_VERBOSE = false; + @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) public List> rods = Collections.emptyList(); @@ -82,7 +86,10 @@ public class PileupWalker extends LocusWalker implements TreeR if ( context.hasBasePileup() ) { ReadBackedPileup basePileup = context.getBasePileup(); - out.printf("%s %s%n", basePileup.getPileupString(ref.getBaseAsChar()), rods); + out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); + if ( SHOW_VERBOSE ) + out.printf(" %s", createVerboseOutput(basePileup)); + out.println(); } if ( context.hasExtendedEventPileup() ) { @@ -125,6 +132,24 @@ public class PileupWalker extends LocusWalker implements TreeR return rodString; } + + private static String createVerboseOutput(final ReadBackedPileup pileup) { + final StringBuilder sb = new StringBuilder(); + boolean isFirst = true; + + for ( PileupElement p : pileup ) { + if ( isFirst ) + isFirst = false; + else + sb.append(","); + sb.append(p.getRead().getReadName()); + sb.append(":"); + sb.append(p.getOffset()); + sb.append(":"); + sb.append(p.getRead().getReadLength()); + } + return sb.toString(); + } @Override public void onTraversalDone(Integer result) { From 5dbd3625cd67a90d541375aa4d38c5a944078718 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 27 Mar 2012 13:38:52 -0400 Subject: [PATCH 13/14] Initial algorithm for choosing best alternate haplotypes to genotype based on the likelihoods from all samples instead of choosing for each sample independently. Simple tradeoff of penalty for increasing model complexity and likelihood of the data. --- .../src/org/broadinstitute/sting/utils/Haplotype.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 143fdf4bf..1820ddbc9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -182,7 +182,6 @@ public class Haplotype { public static LinkedHashMap makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, final int haplotypeSize, final int numPrefBases) { - LinkedHashMap haplotypeMap = new LinkedHashMap(); Allele refAllele = null; @@ -215,13 +214,13 @@ public class Haplotype { // Create location for all haplotypes - int startLoc = ref.getWindow().getStart() + startIdxInReference; - int stopLoc = startLoc + haplotypeSize-1; + final int startLoc = ref.getWindow().getStart() + startIdxInReference; + final int stopLoc = startLoc + haplotypeSize-1; - GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); + final GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); - for (Allele a : alleleList) { + for (final Allele a : alleleList) { byte[] alleleBases = a.getBases(); // use string concatenation @@ -315,5 +314,4 @@ public class Haplotype { return (fallsInsideDeletion ? -1 : readBases); } - } From ea9c04b8c24feeb5d37a97bdd26275cfaf7f580c Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Tue, 27 Mar 2012 14:32:02 -0400 Subject: [PATCH 14/14] Updated license year --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 634096e2b..648ec8fc3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2011 The Broad Institute +Copyright (c) 2012 The Broad Institute Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation