diff --git a/LICENSE b/LICENSE index 634096e2b..648ec8fc3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2011 The Broad Institute +Copyright (c) 2012 The Broad Institute Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 9a7c4ced0..2b611109f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.report; import org.apache.commons.lang.math.NumberUtils; -import java.util.Arrays; -import java.util.Collection; -import java.util.TreeMap; +import java.util.*; /** * Holds values for a column in a GATK report table */ -public class GATKReportColumn extends TreeMap { +public class GATKReportColumn extends LinkedHashMap { final private String columnName; final private Object defaultValue; final private String format; diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 62c36ca6c..e0e3ad1fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -756,10 +756,6 @@ public class GATKReportTable { */ // Get the column widths for everything - HashMap columnFormats = new HashMap(); - for (String columnName : columns.keySet()) { - columnFormats.put(columnName, columns.get(columnName).getColumnFormat()); - } String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; // Emit the table definition @@ -787,7 +783,7 @@ public class GATKReportTable { if (needsPadding) { out.printf(" "); } - out.printf(columnFormats.get(columnName).getNameFormat(), columnName); + out.printf(columns.get(columnName).getColumnFormat().getNameFormat(), columnName); needsPadding = true; } @@ -796,29 +792,31 @@ public class GATKReportTable { out.printf("%n"); // Emit the table body - for (Object primaryKey : primaryKeyColumn) { + for (final Object primaryKey : primaryKeyColumn) { needsPadding = false; if (primaryKeyDisplay) { out.printf(primaryKeyFormat, primaryKey); needsPadding = true; } - for (String columnName : columns.keySet()) { - if (columns.get(columnName).isDisplayable()) { + for (final Map.Entry entry : columns.entrySet()) { + final GATKReportColumn column = entry.getValue(); + if (column.isDisplayable()) { if (needsPadding) { - out.printf(" "); + out.print(" "); } - String value = columns.get(columnName).getStringValue(primaryKey); - out.printf(columnFormats.get(columnName).getValueFormat(), value); + + final String value = column.getStringValue(primaryKey); + out.printf(column.getColumnFormat().getValueFormat(), value); needsPadding = true; } } - out.printf("%n"); + out.println(); } - out.printf("%n"); + out.println(); } public int getNumRows() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java index 4d8be4800..0c2b3e349 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -68,6 +69,9 @@ public class PileupWalker extends LocusWalker implements TreeR @Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events") public boolean SHOW_INDEL_PILEUPS = false; + @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output") + public boolean SHOW_VERBOSE = false; + @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) public List> rods = Collections.emptyList(); @@ -82,7 +86,10 @@ public class PileupWalker extends LocusWalker implements TreeR if ( context.hasBasePileup() ) { ReadBackedPileup basePileup = context.getBasePileup(); - out.printf("%s %s%n", basePileup.getPileupString(ref.getBaseAsChar()), rods); + out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); + if ( SHOW_VERBOSE ) + out.printf(" %s", createVerboseOutput(basePileup)); + out.println(); } if ( context.hasExtendedEventPileup() ) { @@ -125,6 +132,24 @@ public class PileupWalker extends LocusWalker implements TreeR return rodString; } + + private static String createVerboseOutput(final ReadBackedPileup pileup) { + final StringBuilder sb = new StringBuilder(); + boolean isFirst = true; + + for ( PileupElement p : pileup ) { + if ( isFirst ) + isFirst = false; + else + sb.append(","); + sb.append(p.getRead().getReadName()); + sb.append(":"); + sb.append(p.getOffset()); + sb.append(":"); + sb.append(p.getRead().getReadLength()); + } + return sb.toString(); + } @Override public void onTraversalDone(Integer result) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index e7a2f74e2..10ac523e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -91,6 +91,25 @@ public class ErrorRatePerCycle extends LocusWalker { this.cycle = cycle; } + // Must overload hashCode and equals to properly work with GATKReportColumn + @Override + public int hashCode() { + return readGroup.hashCode() + 33 * cycle; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final TableKey oKey = (TableKey) o; + + if ( cycle != oKey.cycle ) return false; + if ( !readGroup.equals(oKey.readGroup) ) return false; + + return true; + } + @Override public int compareTo(final TableKey tableKey) { final int scmp = readGroup.compareTo(tableKey.readGroup); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 988b6d1ed..23d7c0ad6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -70,6 +70,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @return the alleles used for genotyping */ protected abstract List getLog10PNonRef(final VariantContext vc, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index 9c4af8512..0867d949e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -25,6 +25,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.ArrayList; +import java.util.Arrays; + /** * Created by IntelliJ IDEA. * User: ebanks @@ -34,23 +39,54 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; */ public class AlleleFrequencyCalculationResult { - // IMPORTANT NOTE: - // These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N). - // For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that - // the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may - // be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0). - // In the bi-allelic case (where there are no other alternate alleles over which to marginalize), - // the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED. - final double[][] log10AlleleFrequencyLikelihoods; - final double[][] log10AlleleFrequencyPosteriors; + // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles + private double log10MLE; + private double log10MAP; + final private int[] alleleCountsOfMLE; + final private int[] alleleCountsOfMAP; - // These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - double log10LikelihoodOfAFzero = 0.0; - double log10PosteriorOfAFzero = 0.0; + // The posteriors seen, not including that of AF=0 + // TODO -- better implementation needed here (see below) + private ArrayList log10PosteriorMatrixValues = new ArrayList(100000); + private Double log10PosteriorMatrixSum = null; - public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { - log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; - log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; + // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + private double log10LikelihoodOfAFzero; + private double log10PosteriorOfAFzero; + + + public AlleleFrequencyCalculationResult(final int maxAltAlleles) { + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + reset(); + } + + public double getLog10MLE() { + return log10MLE; + } + + public double getLog10MAP() { + return log10MAP; + } + + public double getLog10PosteriorMatrixSum() { + if ( log10PosteriorMatrixSum == null ) { + // TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory; + // TODO -- will discuss with the team what the best option is + final double[] tmp = new double[log10PosteriorMatrixValues.size()]; + for ( int i = 0; i < tmp.length; i++ ) + tmp[i] = log10PosteriorMatrixValues.get(i); + log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp); + } + return log10PosteriorMatrixSum; + } + + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + public int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; } public double getLog10LikelihoodOfAFzero() { @@ -60,4 +96,47 @@ public class AlleleFrequencyCalculationResult { public double getLog10PosteriorOfAFzero() { return log10PosteriorOfAFzero; } + + public void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { + alleleCountsOfMLE[i] = 0; + alleleCountsOfMAP[i] = 0; + } + log10PosteriorMatrixValues.clear(); + log10PosteriorMatrixSum = null; + } + + public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMLE[i] = alleleCountsForK[i]; + } + } + + public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { + log10PosteriorMatrixValues.add(log10LofK); + if ( log10LofK > log10MAP ) { + log10MAP = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMAP[i] = alleleCountsForK[i]; + } + } + + public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java new file mode 100644 index 000000000..3f03c2bb2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.*; + +/** + * Code for determining which indels are segregating among the samples. + * + * This code is just a refactor of the original code from Guillermo in the UG. + * + * @author Mark DePristo + * @since 3/26/12 + */ +public class ConsensusAlleleCounter { + final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); + private final int minIndelCountForGenotyping; + private final boolean doMultiAllelicCalls; + private final double minFractionInOneSample; + private final GenomeLocParser locParser; + + public ConsensusAlleleCounter(final GenomeLocParser locParser, + final boolean doMultiAllelicCalls, + final int minIndelCountForGenotyping, + final double minFractionInOneSample) { + this.minIndelCountForGenotyping = minIndelCountForGenotyping; + this.doMultiAllelicCalls = doMultiAllelicCalls; + this.minFractionInOneSample = minFractionInOneSample; + this.locParser = locParser; + } + + /** + * Returns a list of Alleles at this locus that may be segregating + * + * @param ref + * @param contexts + * @param contextType + * @return + */ + public List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); +// logger.info("Alleles at " + ref.getLocus()); +// for ( Map.Entry elt : consensusIndelStrings.entrySet() ) { +// logger.info(" " + elt.getValue() + " => " + elt.getKey()); +// } + return consensusCountsToAlleles(ref, consensusIndelStrings); + } + + // + // TODO -- WARNING DOESN'T WORK WITH REDUCED READS + // + private Map countConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final GenomeLoc loc = ref.getLocus(); + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for (Map.Entry sample : contexts.entrySet()) { + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + insCount += indelPileup.getNumberOfInsertions(); + delCount += indelPileup.getNumberOfDeletions(); + } + + if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) + return Collections.emptyMap(); + + for (Map.Entry sample : contexts.entrySet()) { + // todo -- warning, can be duplicating expensive partition here + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + + final int nIndelReads = indelPileup.getNumberOfInsertions() + indelPileup.getNumberOfDeletions(); + final int nReadsOverall = indelPileup.getNumberOfElements(); + if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample) { +// if ( nIndelReads > 0 ) +// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + continue; +// } else { +// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + } + + for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read == null) + continue; + if (ReadUtils.is454Read(read)) { + continue; + } + +/* if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + */ + String indelString = p.getEventBases(); + if (p.isInsertion()) { + boolean foundKey = false; + // copy of hashmap into temp arrayList + ArrayList> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map + if (s.startsWith(indelString)) { + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + if (s.endsWith(indelString)) { + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + } + + return consensusIndelStrings; + } + + private List consensusCountsToAlleles(final ReferenceContext ref, + final Map consensusIndelStrings) { + final GenomeLoc loc = ref.getLocus(); + final Collection vcs = new ArrayList(); + int maxAlleleCnt = 0; + Allele refAllele, altAllele; + + for (final Map.Entry elt : consensusIndelStrings.entrySet()) { + final String s = elt.getKey(); + final int curCnt = elt.getValue(); + int stop = 0; + + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + final int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else continue; // don't go on with this allele if refBases are non-standard + } else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } + else continue; // go on to next allele if consensus insertion has any non-standard base. + } + + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(Arrays.asList(refAllele, altAllele)); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } + + } + } + + if (vcs.isEmpty()) + return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion + + final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + return mergedVC.getAlleles(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index d18f7e5ed..00d90e3f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -52,11 +52,9 @@ import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final int HAPLOTYPE_SIZE; - private final int minIndelCountForGenotyping; private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; @@ -72,217 +70,30 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup private GenomeLoc lastSiteVisited; - private ArrayList alleleList; + private List alleleList = new ArrayList(); static { indelLikelihoodMap.set(new HashMap>()); } - public IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { + protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - - public static ArrayList computeConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser, - int minIndelCountForGenotyping, boolean doMultiAllelicCalls) { - Allele refAllele = null, altAllele = null; - GenomeLoc loc = ref.getLocus(); - ArrayList aList = new ArrayList(); - - HashMap consensusIndelStrings = new HashMap(); - - int insCount = 0, delCount = 0; - // quick check of total number of indels in pileup - for (Map.Entry sample : contexts.entrySet()) { - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - insCount += indelPileup.getNumberOfInsertions(); - delCount += indelPileup.getNumberOfDeletions(); - } - - if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) - return aList; - - for (Map.Entry sample : contexts.entrySet()) { - // todo -- warning, can be duplicating expensive partition here - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - - - for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { - //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) - continue; - if (ReadUtils.is454Read(read)) { - continue; - } - -/* if (DEBUG && p.isIndel()) { - System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", - read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), - p.getEventLength(),p.getType().toString(), p.getEventBases()); - } - */ - - String indelString = p.getEventBases(); - if (p.isInsertion()) { - boolean foundKey = false; - // copy of hashmap into temp arrayList - ArrayList> cList = new ArrayList>(); - for (String s : consensusIndelStrings.keySet()) { - cList.add(new Pair(s,consensusIndelStrings.get(s))); - } - - if (read.getAlignmentEnd() == loc.getStart()) { - // first corner condition: a read has an insertion at the end, and we're right at the insertion. - // In this case, the read could have any of the inserted bases and we need to build a consensus - - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - // case 1: current insertion is prefix of indel in hash map - if (s.startsWith(indelString)) { - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.startsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - } - else if (read.getAlignmentStart() == loc.getStart()+1) { - // opposite corner condition: read will start at current locus with an insertion - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - if (s.endsWith(indelString)) { - // case 1: current insertion (indelString) is suffix of indel in hash map (s) - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - - } - else { - // normal case: insertion somewhere in the middle of a read: add count to arrayList - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - cList.add(new Pair(indelString,cnt+1)); - } - - // copy back arrayList into hashMap - consensusIndelStrings.clear(); - for (Pair pair : cList) { - consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); - } - - } - else if (p.isDeletion()) { - indelString = String.format("D%d",p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); - - } - } - - } - - Collection vcs = new ArrayList(); - int maxAlleleCnt = 0; - String bestAltAllele = ""; - - for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s), stop = 0; - // if observed count if above minimum threshold, we will genotype this allele - if (curCnt < minIndelCountForGenotyping) - continue; - - if (s.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(s.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); - stop = loc.getStart() + dLen; - byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); - - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases, true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); - } - else continue; // don't go on with this allele if refBases are non-standard - } else { - // insertion case - if (Allele.acceptableAlleleBases(s)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(s, false); - stop = loc.getStart(); - } - else continue; // go on to next allele if consensus insertion has any non-standard base. - } - - - ArrayList vcAlleles = new ArrayList(); - vcAlleles.add(refAllele); - vcAlleles.add(altAllele); - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(vcAlleles); - builder.referenceBaseForIndel(ref.getBase()); - builder.noGenotypes(); - if (doMultiAllelicCalls) - vcs.add(builder.make()); - else { - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - - } - } - - if (vcs.isEmpty()) - return aList; // nothing else to do, no alleles passed minimum count criterion - - VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); - - aList = new ArrayList(mergedVC.getAlleles()); - - return aList; - + private List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenomeLocParser locParser) { + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + return counter.computeConsensusAlleles(ref, contexts, contextType); } private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); @@ -338,7 +149,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } else { - alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser, minIndelCountForGenotyping,doMultiAllelicCalls); + alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser); if (alleleList.isEmpty()) return null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java deleted file mode 100755 index 99d55bc69..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java +++ /dev/null @@ -1,209 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.commons.lang.NotImplementedException; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.security.cert.CertificateNotYetValidException; -import java.util.*; - -import org.broadinstitute.sting.utils.codecs.vcf.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 8/30/11 - * Time: 10:08 AM - * To change this template use File | Settings | File Templates. - */ -public class UGBoundAF extends RodWalker { - - @Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true) - VCFWriter writer; - - @Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true) - List> variants; - - private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0); - - private HashMap> epsilonPosteriorCache = new HashMap>(8192); - private HashMap logAC0Cache = new HashMap(8192); - private int QUANTIZATION_FACTOR = 1000; - - - public void initialize() { - Set allHeaderLines = new HashSet(1024); - for ( RodBinding v : variants ) { - String trackName = v.getName(); - Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set headerLines = new HashSet(vcfHeaders.get(trackName).getMetaData()); - } - allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+ - "frequency. First value=95% probability AF>x. Second value=95% probability AF allVariants = tracker.getValues(variants); - if ( allVariants.size() == 0 ) { - return null; - } - - List alternateAlleles = getAllAlternateAlleles(allVariants); - VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet())); - if ( alternateAlleles.size() > 1 ) { - logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString()); - alternateAlleles.add(allVariants.get(0).getReference()); - builder.alleles(alternateAlleles); - builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles))); - } else { - // get all the genotype likelihoods - GenotypesContext context = GenotypesContext.create(); - int numNoCall = 0; - for ( VariantContext v : allVariants ) { - numNoCall += v.getNoCallCount(); - context.addAll(v.getGenotypes()); - } - builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context))); - } - - return builder.make(); - } - - private List getAllAlternateAlleles(List variants) { - List alleles = new ArrayList(3); // some overhead - for ( VariantContext v : variants ) { - alleles.addAll(v.getAlternateAlleles()); - } - return alleles; - } - - @Override - public Integer reduce(VariantContext value, Integer sum) { - if ( value == null ) - return sum; - writer.add(value); - return ++sum; - } - - private int N_ITERATIONS = 1; - private double[] getACPosteriors(GenotypesContext gc) { - // note this uses uniform priors (!) - - double[][] zeroPriors = new double[1][1+2*gc.size()]; - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size()); - // todo -- allow multiple alleles here - for ( int i = 0; i < N_ITERATIONS; i ++ ) { - ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false); - } - - return result.log10AlleleFrequencyPosteriors[0]; - } - - private String boundAlleleFrequency(double[] ACLikelihoods) { - // note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs - // thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be - // agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over - - // want to pick a lower limit x and upper limit y such that - // int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95 - // int_{f=x to y} calculateAFPosterior(f) df = 0.95 - // and that (y-x) is minimized - - // this is done by quantizing [0,1] into small bins and, since the distribution is - // unimodal, greedily adding them until the probability is >= 0.95 - - throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented")); - } - - private double calculateAFPosterior(double[] likelihoods, double af) { - double[] probLiks = new double[likelihoods.length]; - for ( int c = 0; c < likelihoods.length; c++) { - probLiks[c] = calculateAFPosterior(c,likelihoods.length,af); - } - - return MathUtils.log10sumLog10(probLiks); - } - - private double calculateAFPosterior(int ac, int n, double af) { - // evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes - switch ( ac ) { - case 0: - return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af); - case 1: - return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT); - case 2: - return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT); - default: - return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac); - } - } - - private double logAC0Coef(int an) { - if ( ! logAC0Cache.containsKey(an) ) { - double coef = -Math.log10(EPS_LOWER_LIMIT); - for ( int k = 1; k <= an; k++ ) { - // note this should typically just be - // term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k - // but the 1-E term will just be 1, so we do the following to mitigate this problem - double binom = MathUtils.binomialCoefficient(an,k); - double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k); - double term = binom/k - Math.pow(eps_correction,k); - if ( k % 2 == 0 ) { - coef += term; - } else { - coef -= term; - } - } - - logAC0Cache.put(an,coef); - } - - return logAC0Cache.get(an); - } - - private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) { - double mid = (start + stop)/2; - double size = stop-start; - double fa = calculateAFPosterior(likelihoods,start); - double fb = calculateAFPosterior(likelihoods,mid); - double fc = calculateAFPosterior(likelihoods,stop); - double s = (size/6)*(fa + 4*fc + fb); - double h = simpAux(likelihoods,start,stop,err,s,fa,fb,fc,cap); - return h; - } - - private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){ - if ( s == 0 ) - return -300.0; - double c = ( a + b )/2; - double h = b-a; - double d = (a + c)/2; - double e = (c + b)/2; - double fd = calculateAFPosterior(likelihoods, d); - double fe = calculateAFPosterior(likelihoods, e); - double s_l = (h/12)*(fa + 4*fd + fc); - double s_r = (h/12)*(fc + 4*fe + fb); - double s_2 = s_l + s_r; - if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){ - return Math.log10(s_2 + (s_2 - s)/15.0); - } - - return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 823eafc46..58b8af493 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -118,6 +118,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute + * to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end + * up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for + * low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily over + * come this threshold. + */ + @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) + public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; + + /** * This argument informs the prior probability of having an indel at a site. */ @@ -165,6 +176,7 @@ public class UnifiedArgumentCollection { uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; + uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 9a6b3b1dd..bf482f5d7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -137,6 +137,7 @@ public class UnifiedGenotyper extends LocusWalker posteriorsArray = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[][] log10AlleleFrequencyPriorsSNPs; - private final double[][] log10AlleleFrequencyPriorsIndels; + private final double[] log10AlleleFrequencyPriorsSNPs; + private final double[] log10AlleleFrequencyPriorsIndels; // the priors object private final GenotypePriors genotypePriorsSNPs; @@ -131,9 +131,8 @@ public class UnifiedGenotyperEngine { this.annotationEngine = engine; this.N = N; - - log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; - log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + log10AlleleFrequencyPriorsSNPs = new double[N+1]; + log10AlleleFrequencyPriorsIndels = new double[N+1]; computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); @@ -269,8 +268,8 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); - posteriorsArray.set(new double[N + 2]); + alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); + posteriorsArray.set(new double[2]); } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); @@ -283,9 +282,7 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? @@ -300,12 +297,11 @@ public class UnifiedGenotyperEngine { // the genotyping model may have stripped it out if ( indexOfAllele == -1 ) continue; - - int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]); - // if the most likely AC is not 0, then this is a good alternate allele to use; - // make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array - if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { + int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1]; + + // if the most likely AC is not 0, then this is a good alternate allele to use + if ( indexOfBestAC != 0 ) { myAlleles.add(alternateAllele); bestGuessIsRef = false; } @@ -316,7 +312,6 @@ public class UnifiedGenotyperEngine { } // calculate p(f>0): - // because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); final double PofF = 1.0 - normalizedPosteriors[0]; @@ -324,18 +319,11 @@ public class UnifiedGenotyperEngine { if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero; + phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - double sum = AFresult.log10AlleleFrequencyPosteriors[0][0]; - if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - sum = 0.0; - for (int i = 1; i <= N; i++) { - if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - break; - sum += AFresult.log10AlleleFrequencyPosteriors[0][i]; - } + final double sum = AFresult.getLog10PosteriorMatrixSum(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } @@ -364,7 +352,7 @@ public class UnifiedGenotyperEngine { // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model); + printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -378,29 +366,27 @@ public class UnifiedGenotyperEngine { // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List alternateAllelesToUse = builder.make().getAlternateAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; - double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; - double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -437,9 +423,9 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero; - System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1); + public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { + normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); + normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum(); return MathUtils.normalizeFromLog10(normalizedPosteriors); } @@ -499,14 +485,6 @@ public class UnifiedGenotyperEngine { return stratifiedContexts; } - protected static void clearAFarray(double[][] AFs) { - for ( int i = 0; i < AFs.length; i++ ) { - for ( int j = 0; j < AFs[i].length; j++ ) { - AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } - } - private final static double[] binomialProbabilityDepthCache = new double[10000]; static { for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { @@ -551,7 +529,7 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); } - protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors, final GenotypeLikelihoodsCalculationModel.Model model) { + protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { Allele refAllele = null, altAllele = null; for ( Allele allele : vc.getAlleles() ) { if ( allele.isReference() ) @@ -574,11 +552,8 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) - AFline.append("0.00000000\t"); - else - AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i])); - AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE())); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP())); verboseWriter.println(AFline.toString()); } @@ -648,31 +623,29 @@ public class UnifiedGenotyperEngine { return null; } - protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { + protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { - // the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i - for (int alleles = 1; alleles <= priors.length; alleles++) { - double sum = 0.0; + double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - double value = Math.pow(theta, alleles) / (double)i; - priors[alleles-1][i] = Math.log10(value); - sum += value; - } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[alleles-1][0] = Math.log10(1.0 - sum); + // for each i + for (int i = 1; i <= N; i++) { + final double value = theta / (double)i; + priors[i] = Math.log10(value); + sum += value; } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); } - protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { if (model.name().toUpperCase().contains("SNP")) return log10AlleleFrequencyPriorsSNPs; else if (model.name().toUpperCase().contains("INDEL")) return log10AlleleFrequencyPriorsIndels; else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + } private static GenotypePriors createGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index ff3fe6506..3e48520a7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -25,19 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.HashMap; -import java.util.List; -import java.util.Map; import java.util.TreeSet; public class GLBasedSampleSelector extends SampleSelector { - Map numAllelePriorMatrix = new HashMap(); + double[] flatPriors = null; double referenceLikelihood; public GLBasedSampleSelector(TreeSet sm, double refLik) { super(sm); @@ -53,9 +47,11 @@ public class GLBasedSampleSelector extends SampleSelector { // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? - double[][] flatPrior = createFlatPrior(vc.getAlleles()); - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); + if ( flatPriors == null ) { + flatPriors = new double[1+2*samples.size()]; + } + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); + ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); // do we want to let this qual go up or down? if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { return true; @@ -63,12 +59,4 @@ public class GLBasedSampleSelector extends SampleSelector { return false; } - - private double[][] createFlatPrior(List alleles) { - if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) { - numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]); - } - - return numAllelePriorMatrix.get(alleles.size()); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 2b9f159ac..ebd2500fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker implements Tr // find the comp final VariantContext comp = findMatchingComp(eval, compSet); - HashMap> stateMap = new HashMap>(); - for ( VariantStratifier vs : stratificationObjects ) { - List states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName); - stateMap.put(vs, states); - } - - ArrayList stateKeys = new ArrayList(); - variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); - - HashSet stateKeysHash = new HashSet(stateKeys); - - for ( StateKey stateKey : stateKeysHash ) { + for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) { NewEvaluationContext nec = evaluationContexts.get(stateKey); // eval against the comp @@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker implements Tr return null; } +// private Iterable getApplicableStates(final RefMetaDataTracker tracker, +// final ReferenceContext ref, +// final VariantContext eval, +// final String evalName, +// final VariantContext comp, +// final String compName, +// final String sampleName ) { +// Set oldKeys = new HashSet(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName))); +// +// int n = 0; +// for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) { +// n++; +// if ( ! oldKeys.contains(newKey) ) +// throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm"); +// } +// +// if ( n != oldKeys.size() ) +// throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size()); +// +// return oldKeys; +// } + +// private Iterable getApplicableStatesNew(final RefMetaDataTracker tracker, +// final ReferenceContext ref, +// final VariantContext eval, +// final String evalName, +// final VariantContext comp, +// final String compName, +// final String sampleName ) { +// // todo -- implement optimized version +// } + + /** + * Given specific eval and comp VCs and the sample name, return an iterable + * over all of the applicable state keys. + * + * See header of StateKey for performance problems... + * + * @param tracker + * @param ref + * @param eval + * @param evalName + * @param comp + * @param compName + * @param sampleName + * @return + */ + private Iterable getApplicableStates(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final VariantContext eval, + final String evalName, + final VariantContext comp, + final String compName, + final String sampleName ) { + final HashMap> stateMap = new HashMap>(stratificationObjects.size()); + for ( final VariantStratifier vs : stratificationObjects ) { + List states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName); + stateMap.put(vs, states); + } + + ArrayList stateKeys = new ArrayList(); + variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); + + return new HashSet(stateKeys); + } + + @Requires({"comp != null", "evals != null"}) private boolean compHasMatchingEval(final VariantContext comp, final Collection evals) { // find all of the matching comps @@ -482,11 +538,13 @@ public class VariantEvalWalker extends RodWalker implements Tr public void onTraversalDone(Integer result) { logger.info("Finalizing variant report"); - for ( StateKey stateKey : evaluationContexts.keySet() ) { - NewEvaluationContext nec = evaluationContexts.get(stateKey); + for ( Map.Entry ecElt : evaluationContexts.entrySet() ) { + final StateKey stateKey = ecElt.getKey(); + final NewEvaluationContext nec = ecElt.getValue(); for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) { ve.finalizeEvaluation(); + final String veName = ve.getSimpleName(); // ve.getClass().getSimpleName(); AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); Map datamap = scanner.getData(); @@ -498,7 +556,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if (field.get(ve) instanceof TableType) { TableType t = (TableType) field.get(ve); - final String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); + final String subTableName = veName + "." + field.getName(); final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; @@ -539,11 +597,10 @@ public class VariantEvalWalker extends RodWalker implements Tr } } } else { - GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); + GATKReportTable table = report.getTable(veName); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getName(); - + final String columnName = vs.getName(); table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index d5cf685de..226429439 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -8,6 +8,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; public abstract class VariantEvaluator { private VariantEvalWalker walker; + private final String simpleName; + + protected VariantEvaluator() { + this.simpleName = getClass().getSimpleName(); + } public void initialize(VariantEvalWalker walker) { this.walker = walker; @@ -90,4 +95,8 @@ public abstract class VariantEvaluator { protected static String formattedRatio(final int num, final int denom) { return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom)); } + + public String getSimpleName() { + return simpleName; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java index db44e9e28..793bafdd0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java @@ -27,6 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.lang.annotation.Annotation; import java.lang.reflect.Field; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -40,6 +41,7 @@ import java.util.Map; * the object, a Mashalling object can serialize or deserialize a analysis module. */ public class AnalysisModuleScanner { + final private static Map annotationCache = new HashMap(); // what we extracted from the class private Map datums = new LinkedHashMap(); // the data we've discovered @@ -84,12 +86,22 @@ public class AnalysisModuleScanner { // get the fields from the class, and extract for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) { for (Field f : superCls.getDeclaredFields()) - for (Annotation annotation : f.getAnnotations()) { + for (Annotation annotation : getAnnotations(f)) { if (annotation.annotationType().equals(DataPoint.class)) datums.put(f,(DataPoint) annotation); } } } + + private Annotation[] getAnnotations(final Field field) { + final String fieldName = field.toString(); + Annotation[] annotations = annotationCache.get(fieldName); + if ( annotations == null ) { + annotations = field.getAnnotations(); + annotationCache.put(fieldName, annotations); + } + return annotations; + } /** * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java index 09f2c0168..b5c6a1ecf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java @@ -36,6 +36,16 @@ public class NewEvaluationContext extends HashMap { return new TreeMap(evaluationInstances); } + public StateKey makeStateKey() { + Map map = new HashMap(size()); + + for (Map.Entry elt : this.entrySet() ) { + map.put(elt.getKey().getName(), elt.getValue()); + } + + return new StateKey(map); + } + public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) { for ( final VariantEvaluator evaluation : evaluationInstances.values() ) { // the other updateN methods don't see a null context diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 96bd9a9b7..f62de17a5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -3,25 +3,102 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; import java.util.Map; import java.util.TreeMap; -public class StateKey extends TreeMap { -// public int hashCode() { -// int hashCode = 1; -// -// for (final Map.Entry pair : this.entrySet()) { -// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); -// } -// -// return hashCode; -// } +/** + * A final constant class representing the specific state configuration + * for a VariantEvaluator instance. + * + * The way this is currently implemented is by a map from the name of a VariantStratification to a + * specific state string. For example, the stratification Novelty has states all, known, novel. A + * specific variant and comp would be tagged as "known" by the stratification, and this could be + * represented here by the map (Novelty -> known). + * + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * + * I've been staring at this state key code for a while. It's just not right, and expensive to boot. + * Here are my thoughts for future work. The state key is both a key with specific state values for + * every stratification. For example, (known, sample1, ac=1). This capability is used in some places, + * such as below, to return a set of all states that should be updated given the eval and comp + * VCs. In principle there are a finite set of such combinations (the product of all states for all active + * stratifications at initialization). We could represent such keys as integers into the set of all combinations. + * + * Note that all of the code that manipulates these things is just terrible. It's all string manipulation and + * HashMaps. Since we are effectively always squaring off our VE analyses (i.e., we have a table with + * all variable values for all stratification combinations) it doesn't make sense to allow so much dynamicism. Instead + * we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from + * specific strat state to this key. + * + * The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in + * the initializeStateKey function. Instead, we should have code that looks like: + * + * init: + * allStates <- initializeCombinationalStateSpace + * + * map: + * for each eval / comp pair: + * for each relevantState based on eval / comp: + * allStates[relevantState].update(eval, comp) + * + * + */ +public final class StateKey { + /** High-performance cache of the toString operation for a constant class */ + private final String string; + private final TreeMap states; - public String toString() { - String value = ""; + public StateKey(final Map states) { + this.states = new TreeMap(states); + this.string = formatString(); + } - for ( final String key : this.keySet() ) { - //value += "\tstate " + key + ":" + this.get(key) + "\n"; - value += String.format("%s:%s;", key, this.get(key)); + public StateKey(final StateKey toOverride, final String keyOverride, final String valueOverride) { + if ( toOverride == null ) { + this.states = new TreeMap(); + } else { + this.states = new TreeMap(toOverride.states); } - return value; + this.states.put(keyOverride, valueOverride); + this.string = formatString(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final StateKey stateKey = (StateKey) o; + + if (states != null ? !states.equals(stateKey.states) : stateKey.states != null) return false; + + return true; + } + + @Override + public int hashCode() { + return states.hashCode(); + } + + @Override + public String toString() { + return string; + } + + private final String formatString() { + StringBuilder b = new StringBuilder(); + + for ( Map.Entry entry : states.entrySet() ) { + b.append(String.format("%s:%s;", entry.getKey(), entry.getValue())); + } + + return b.toString(); + } + + // TODO -- might be slow because of tree map + public String get(final String key) { + return states.get(key); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 91c7140e6..9b4ae129a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -214,20 +214,9 @@ public class VariantEvalUtils { ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec)); } } else { - HashMap necs = new HashMap(); - - StateKey stateKey = new StateKey(); - for (VariantStratifier vs : ec.keySet()) { - String state = ec.get(vs); - - stateKey.put(vs.getName(), state); - } - + final StateKey stateKey = ec.makeStateKey(); ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); - - necs.put(stateKey, ec); - - return necs; + return new HashMap(Collections.singletonMap(stateKey, ec)); } return ecs; @@ -428,14 +417,8 @@ public class VariantEvalUtils { HashMap> oneSetOfStates = newStateStack.pop(); VariantStratifier vs = oneSetOfStates.keySet().iterator().next(); - for (String state : oneSetOfStates.get(vs)) { - StateKey newStateKey = new StateKey(); - if (stateKey != null) { - newStateKey.putAll(stateKey); - } - - newStateKey.put(vs.getName(), state); - + for (final String state : oneSetOfStates.get(vs)) { + final StateKey newStateKey = new StateKey(stateKey, vs.getName(), state); initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys); } } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 143fdf4bf..1820ddbc9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -182,7 +182,6 @@ public class Haplotype { public static LinkedHashMap makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, final int haplotypeSize, final int numPrefBases) { - LinkedHashMap haplotypeMap = new LinkedHashMap(); Allele refAllele = null; @@ -215,13 +214,13 @@ public class Haplotype { // Create location for all haplotypes - int startLoc = ref.getWindow().getStart() + startIdxInReference; - int stopLoc = startLoc + haplotypeSize-1; + final int startLoc = ref.getWindow().getStart() + startIdxInReference; + final int stopLoc = startLoc + haplotypeSize-1; - GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); + final GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); - for (Allele a : alleleList) { + for (final Allele a : alleleList) { byte[] alleleBases = a.getBases(); // use string concatenation @@ -315,5 +314,4 @@ public class Haplotype { return (fallsInsideDeletion ? -1 : readBases); } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3a5d859c3..072980c27 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -234,6 +234,9 @@ public class MathUtils { double sum = 0.0; double maxValue = Utils.findMaxEntry(log10p); + if(maxValue == Double.NEGATIVE_INFINITY) + return sum; + for (int i = start; i < finish; i++) { sum += Math.pow(10.0, log10p[i] - maxValue); } @@ -1508,6 +1511,24 @@ public class MathUtils { return result; } + /** Same routine, unboxed types for efficiency + * + * @param x + * @param y + * @return Vector of same length as x and y so that z[k] = x[k]+y[k] + */ + public static double[] vectorSum(double[]x, double[] y) { + if (x.length != y.length) + throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); + + double[] result = new double[x.length]; + for (int k=0; k Double[] scalarTimesVector(E a, E[] v1) { Double result[] = new Double[v1.length]; diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index a824fefab..130a7fa2f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -703,4 +703,11 @@ public class Utils { return programRecord; } + public static Collection makeCollection(Iterable iter) { + Collection list = new ArrayList(); + for (E item : iter) { + list.add(item); + } + return list; + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index c7d196b53..31c7a4e83 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -18,7 +17,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { static double[] AA1, AB1, BB1; static double[] AA2, AB2, AC2, BB2, BC2, CC2; static final int numSamples = 3; - static double[][] priors = new double[2][2*numSamples+1]; // flat priors + static double[] priors = new double[2*numSamples+1]; // flat priors @BeforeSuite public void before() { @@ -83,26 +82,16 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); - for ( int i = 0; i < 2; i++ ) { - for ( int j = 0; j < 2*numSamples+1; j++ ) { - result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) { - Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero); - } else { - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } + Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b3bd0253c..216406b63 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -17,8 +17,8 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232")); + Arrays.asList("ac3737b4212f634a03c640c83f670955")); executeTest("test MultiSample Pilot1", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); + Arrays.asList("6f70dfbaf3bb70c702f9e9dbacd67c17")); executeTest("test Multiple SNP alleles", spec); } @@ -138,7 +138,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("6172d2f3d370132f4c57a26aa94c256e")); + Arrays.asList("e9d23a08472e4e27b4f25e844f5bad57")); executeTest("test SLOD", spec); } @@ -146,8 +146,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); - e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "ecf92054c1e4bd9d6529b8002d385165" ); + e.put( "--output_mode EMIT_ALL_SITES", "119c9fcefbc69e0fc10b1dc52f6438e3" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -300,13 +300,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("52340d578a708fa709b69ce48987bc9d")); + Arrays.asList("fbc48d7d9e622c9af7922f91bc858151")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("9566c7abef5ee5829a516d90445b347f")); + Arrays.asList("94c52ef70e44709ccd947d32e9c27da9")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -335,4 +335,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { UserException.class); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing SnpEff + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("f08ff07ad49d388198c1887baad05977")); + executeTest("test minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("a0945fd21369aaf68c7f1d96dbb930d1")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a")); + executeTest("test minIndelFraction 1.0", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java new file mode 100644 index 000000000..d7c866a0a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java @@ -0,0 +1,90 @@ +package org.broadinstitute.sting.gatk.walkers.validation; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 3/26/12 + * Time: 3:29 PM + * To change this template use File | Settings | File Templates. + */ +public class ValidationSiteSelectorIntegrationTest extends WalkerTest { + public static String baseTestString(String args) { + return "-T ValidationSiteSelector -R " + b36KGReference + " -L 1 -o %s -NO_HEADER -numSites 100 " + args; + } + + private static String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + private static String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + private static String samplePrefix = " -sf " + samplesFile; + private static String freqUnif = " --frequencySelectionMode UNIFORM "; + private static String freqAF = " --frequencySelectionMode KEEP_AF_SPECTRUM "; + private static String sampleNone = " -sampleMode NONE "; + private static String sampleGT = samplePrefix+" -sampleMode POLY_BASED_ON_GT "; + private static String sampleGL = samplePrefix+" -sampleMode POLY_BASED_ON_GL -samplePNonref 0.95"; + + + @Test(enabled=true) + public void testNoSampleSelectionFreqUniform() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleNone + freqUnif + "--variant " + testfile), + 1, + Arrays.asList("d49baeb8000a426c172ce1d81eb37963") + ); + + executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec); + } + + @Test(enabled=true) + public void testNoSampleSelectionFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleNone + freqAF + "--variant " + testfile), + 1, + Arrays.asList("0fb0d015d462c34514fc7e96beea5f56") + ); + + executeTest("testNoSampleSelectionFreqAF--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGTFreqUniform() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGT + freqUnif + "--variant " + testfile), + 1, + Arrays.asList("0672854299d42ea8af906976a3849ae6") + ); + + executeTest("testPolyGTFreqUniform--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGTFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGT + freqAF + "--variant " + testfile), + 1, + Arrays.asList("5bdffda1a063d0bddd6b236854ec627d") + ); + + executeTest("testPolyGTFreqAF--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGLFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGL + freqAF + "--variant " + testfile), + 1, + Arrays.asList("35ef16aa41303606a4b94f7b88bd9aa8") + ); + + executeTest("testPolyGLFreqAF--" + testfile, spec); + } + +}