diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 9a7c4ced0..2b611109f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.report; import org.apache.commons.lang.math.NumberUtils; -import java.util.Arrays; -import java.util.Collection; -import java.util.TreeMap; +import java.util.*; /** * Holds values for a column in a GATK report table */ -public class GATKReportColumn extends TreeMap { +public class GATKReportColumn extends LinkedHashMap { final private String columnName; final private Object defaultValue; final private String format; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java new file mode 100644 index 000000000..3f03c2bb2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.*; + +/** + * Code for determining which indels are segregating among the samples. + * + * This code is just a refactor of the original code from Guillermo in the UG. + * + * @author Mark DePristo + * @since 3/26/12 + */ +public class ConsensusAlleleCounter { + final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); + private final int minIndelCountForGenotyping; + private final boolean doMultiAllelicCalls; + private final double minFractionInOneSample; + private final GenomeLocParser locParser; + + public ConsensusAlleleCounter(final GenomeLocParser locParser, + final boolean doMultiAllelicCalls, + final int minIndelCountForGenotyping, + final double minFractionInOneSample) { + this.minIndelCountForGenotyping = minIndelCountForGenotyping; + this.doMultiAllelicCalls = doMultiAllelicCalls; + this.minFractionInOneSample = minFractionInOneSample; + this.locParser = locParser; + } + + /** + * Returns a list of Alleles at this locus that may be segregating + * + * @param ref + * @param contexts + * @param contextType + * @return + */ + public List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); +// logger.info("Alleles at " + ref.getLocus()); +// for ( Map.Entry elt : consensusIndelStrings.entrySet() ) { +// logger.info(" " + elt.getValue() + " => " + elt.getKey()); +// } + return consensusCountsToAlleles(ref, consensusIndelStrings); + } + + // + // TODO -- WARNING DOESN'T WORK WITH REDUCED READS + // + private Map countConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final GenomeLoc loc = ref.getLocus(); + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for (Map.Entry sample : contexts.entrySet()) { + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + insCount += indelPileup.getNumberOfInsertions(); + delCount += indelPileup.getNumberOfDeletions(); + } + + if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) + return Collections.emptyMap(); + + for (Map.Entry sample : contexts.entrySet()) { + // todo -- warning, can be duplicating expensive partition here + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + + final int nIndelReads = indelPileup.getNumberOfInsertions() + indelPileup.getNumberOfDeletions(); + final int nReadsOverall = indelPileup.getNumberOfElements(); + if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample) { +// if ( nIndelReads > 0 ) +// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + continue; +// } else { +// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + } + + for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read == null) + continue; + if (ReadUtils.is454Read(read)) { + continue; + } + +/* if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + */ + String indelString = p.getEventBases(); + if (p.isInsertion()) { + boolean foundKey = false; + // copy of hashmap into temp arrayList + ArrayList> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map + if (s.startsWith(indelString)) { + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + if (s.endsWith(indelString)) { + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + } + + return consensusIndelStrings; + } + + private List consensusCountsToAlleles(final ReferenceContext ref, + final Map consensusIndelStrings) { + final GenomeLoc loc = ref.getLocus(); + final Collection vcs = new ArrayList(); + int maxAlleleCnt = 0; + Allele refAllele, altAllele; + + for (final Map.Entry elt : consensusIndelStrings.entrySet()) { + final String s = elt.getKey(); + final int curCnt = elt.getValue(); + int stop = 0; + + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + final int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else continue; // don't go on with this allele if refBases are non-standard + } else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } + else continue; // go on to next allele if consensus insertion has any non-standard base. + } + + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(Arrays.asList(refAllele, altAllele)); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } + + } + } + + if (vcs.isEmpty()) + return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion + + final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + return mergedVC.getAlleles(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 21f11d2ff..00d90e3f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -52,11 +52,9 @@ import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final int HAPLOTYPE_SIZE; - private final int minIndelCountForGenotyping; private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; @@ -72,7 +70,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup private GenomeLoc lastSiteVisited; - private ArrayList alleleList; + private List alleleList = new ArrayList(); static { indelLikelihoodMap.set(new HashMap>()); @@ -83,205 +81,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - - private ArrayList computeConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { - Allele refAllele = null, altAllele = null; - GenomeLoc loc = ref.getLocus(); - ArrayList aList = new ArrayList(); - - HashMap consensusIndelStrings = new HashMap(); - - int insCount = 0, delCount = 0; - // quick check of total number of indels in pileup - for (Map.Entry sample : contexts.entrySet()) { - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - insCount += indelPileup.getNumberOfInsertions(); - delCount += indelPileup.getNumberOfDeletions(); - } - - if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) - return aList; - - for (Map.Entry sample : contexts.entrySet()) { - // todo -- warning, can be duplicating expensive partition here - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - - - for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { - //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) - continue; - if (ReadUtils.is454Read(read)) { - continue; - } - -/* if (DEBUG && p.isIndel()) { - System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", - read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), - p.getEventLength(),p.getType().toString(), p.getEventBases()); - } - */ - - String indelString = p.getEventBases(); - if (p.isInsertion()) { - boolean foundKey = false; - // copy of hashmap into temp arrayList - ArrayList> cList = new ArrayList>(); - for (String s : consensusIndelStrings.keySet()) { - cList.add(new Pair(s,consensusIndelStrings.get(s))); - } - - if (read.getAlignmentEnd() == loc.getStart()) { - // first corner condition: a read has an insertion at the end, and we're right at the insertion. - // In this case, the read could have any of the inserted bases and we need to build a consensus - - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - // case 1: current insertion is prefix of indel in hash map - if (s.startsWith(indelString)) { - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.startsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - } - else if (read.getAlignmentStart() == loc.getStart()+1) { - // opposite corner condition: read will start at current locus with an insertion - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - if (s.endsWith(indelString)) { - // case 1: current insertion (indelString) is suffix of indel in hash map (s) - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - - } - else { - // normal case: insertion somewhere in the middle of a read: add count to arrayList - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - cList.add(new Pair(indelString,cnt+1)); - } - - // copy back arrayList into hashMap - consensusIndelStrings.clear(); - for (Pair pair : cList) { - consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); - } - - } - else if (p.isDeletion()) { - indelString = String.format("D%d",p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); - - } - } - - } - - Collection vcs = new ArrayList(); - int maxAlleleCnt = 0; - String bestAltAllele = ""; - - for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s), stop = 0; - // if observed count if above minimum threshold, we will genotype this allele - if (curCnt < minIndelCountForGenotyping) - continue; - - if (s.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(s.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); - stop = loc.getStart() + dLen; - byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); - - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases, true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); - } - else continue; // don't go on with this allele if refBases are non-standard - } else { - // insertion case - if (Allele.acceptableAlleleBases(s)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(s, false); - stop = loc.getStart(); - } - else continue; // go on to next allele if consensus insertion has any non-standard base. - } - - - ArrayList vcAlleles = new ArrayList(); - vcAlleles.add(refAllele); - vcAlleles.add(altAllele); - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(vcAlleles); - builder.referenceBaseForIndel(ref.getBase()); - builder.noGenotypes(); - if (doMultiAllelicCalls) - vcs.add(builder.make()); - else { - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - - } - } - - if (vcs.isEmpty()) - return aList; // nothing else to do, no alleles passed minimum count criterion - - VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); - - aList = new ArrayList(mergedVC.getAlleles()); - - return aList; - + private List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenomeLocParser locParser) { + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + return counter.computeConsensusAlleles(ref, contexts, contextType); } private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 82e411c25..7b8e5c897 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -118,6 +118,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute + * to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end + * up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for + * low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily over + * come this threshold. + */ + @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) + public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; + + /** * This argument informs the prior probability of having an indel at a site. */ @@ -165,6 +176,7 @@ public class UnifiedArgumentCollection { uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; + uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 2b9f159ac..4bfd90eac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -482,11 +482,13 @@ public class VariantEvalWalker extends RodWalker implements Tr public void onTraversalDone(Integer result) { logger.info("Finalizing variant report"); - for ( StateKey stateKey : evaluationContexts.keySet() ) { - NewEvaluationContext nec = evaluationContexts.get(stateKey); + for ( Map.Entry ecElt : evaluationContexts.entrySet() ) { + final StateKey stateKey = ecElt.getKey(); + final NewEvaluationContext nec = ecElt.getValue(); for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) { ve.finalizeEvaluation(); + final String veName = ve.getSimpleName(); // ve.getClass().getSimpleName(); AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); Map datamap = scanner.getData(); @@ -498,7 +500,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if (field.get(ve) instanceof TableType) { TableType t = (TableType) field.get(ve); - final String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); + final String subTableName = veName + "." + field.getName(); final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; @@ -539,11 +541,10 @@ public class VariantEvalWalker extends RodWalker implements Tr } } } else { - GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); + GATKReportTable table = report.getTable(veName); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getName(); - + final String columnName = vs.getName(); table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index d5cf685de..226429439 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -8,6 +8,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; public abstract class VariantEvaluator { private VariantEvalWalker walker; + private final String simpleName; + + protected VariantEvaluator() { + this.simpleName = getClass().getSimpleName(); + } public void initialize(VariantEvalWalker walker) { this.walker = walker; @@ -90,4 +95,8 @@ public abstract class VariantEvaluator { protected static String formattedRatio(final int num, final int denom) { return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom)); } + + public String getSimpleName() { + return simpleName; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java index db44e9e28..793bafdd0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java @@ -27,6 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.lang.annotation.Annotation; import java.lang.reflect.Field; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -40,6 +41,7 @@ import java.util.Map; * the object, a Mashalling object can serialize or deserialize a analysis module. */ public class AnalysisModuleScanner { + final private static Map annotationCache = new HashMap(); // what we extracted from the class private Map datums = new LinkedHashMap(); // the data we've discovered @@ -84,12 +86,22 @@ public class AnalysisModuleScanner { // get the fields from the class, and extract for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) { for (Field f : superCls.getDeclaredFields()) - for (Annotation annotation : f.getAnnotations()) { + for (Annotation annotation : getAnnotations(f)) { if (annotation.annotationType().equals(DataPoint.class)) datums.put(f,(DataPoint) annotation); } } } + + private Annotation[] getAnnotations(final Field field) { + final String fieldName = field.toString(); + Annotation[] annotations = annotationCache.get(fieldName); + if ( annotations == null ) { + annotations = field.getAnnotations(); + annotationCache.put(fieldName, annotations); + } + return annotations; + } /** * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java index 09f2c0168..b5c6a1ecf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java @@ -36,6 +36,16 @@ public class NewEvaluationContext extends HashMap { return new TreeMap(evaluationInstances); } + public StateKey makeStateKey() { + Map map = new HashMap(size()); + + for (Map.Entry elt : this.entrySet() ) { + map.put(elt.getKey().getName(), elt.getValue()); + } + + return new StateKey(map); + } + public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) { for ( final VariantEvaluator evaluation : evaluationInstances.values() ) { // the other updateN methods don't see a null context diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 96bd9a9b7..36b09300b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -3,25 +3,67 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; import java.util.Map; import java.util.TreeMap; -public class StateKey extends TreeMap { -// public int hashCode() { -// int hashCode = 1; -// -// for (final Map.Entry pair : this.entrySet()) { -// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); -// } -// -// return hashCode; -// } +/** + * A final constant class representing the specific state configuration + * for a VariantEvaluator instance. + * + * TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function. + */ +public final class StateKey { + /** High-performance cache of the toString operation for a constant class */ + private final String string; + private final TreeMap states; - public String toString() { - String value = ""; + public StateKey(final Map states) { + this.states = new TreeMap(states); + this.string = formatString(); + } - for ( final String key : this.keySet() ) { - //value += "\tstate " + key + ":" + this.get(key) + "\n"; - value += String.format("%s:%s;", key, this.get(key)); + public StateKey(final StateKey toOverride, final String keyOverride, final String valueOverride) { + if ( toOverride == null ) { + this.states = new TreeMap(); + } else { + this.states = new TreeMap(toOverride.states); } - return value; + this.states.put(keyOverride, valueOverride); + this.string = formatString(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final StateKey stateKey = (StateKey) o; + + if (states != null ? !states.equals(stateKey.states) : stateKey.states != null) return false; + + return true; + } + + @Override + public int hashCode() { + return states.hashCode(); + } + + @Override + public String toString() { + return string; + } + + private final String formatString() { + StringBuilder b = new StringBuilder(); + + for ( Map.Entry entry : states.entrySet() ) { + b.append(String.format("%s:%s;", entry.getKey(), entry.getValue())); + } + + return b.toString(); + } + + // TODO -- might be slow because of tree map + public String get(final String key) { + return states.get(key); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 91c7140e6..9b4ae129a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -214,20 +214,9 @@ public class VariantEvalUtils { ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec)); } } else { - HashMap necs = new HashMap(); - - StateKey stateKey = new StateKey(); - for (VariantStratifier vs : ec.keySet()) { - String state = ec.get(vs); - - stateKey.put(vs.getName(), state); - } - + final StateKey stateKey = ec.makeStateKey(); ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); - - necs.put(stateKey, ec); - - return necs; + return new HashMap(Collections.singletonMap(stateKey, ec)); } return ecs; @@ -428,14 +417,8 @@ public class VariantEvalUtils { HashMap> oneSetOfStates = newStateStack.pop(); VariantStratifier vs = oneSetOfStates.keySet().iterator().next(); - for (String state : oneSetOfStates.get(vs)) { - StateKey newStateKey = new StateKey(); - if (stateKey != null) { - newStateKey.putAll(stateKey); - } - - newStateKey.put(vs.getName(), state); - + for (final String state : oneSetOfStates.get(vs)) { + final StateKey newStateKey = new StateKey(stateKey, vs.getName(), state); initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys); } } else { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b3bd0253c..67cd40eea 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -17,8 +17,8 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); + Arrays.asList("849ee8b21b4bbb02dfc7867a4f1bc14b")); executeTest("test Multiple SNP alleles", spec); } @@ -335,4 +335,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { UserException.class); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing SnpEff + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("f08ff07ad49d388198c1887baad05977")); + executeTest("test minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("a0945fd21369aaf68c7f1d96dbb930d1")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a")); + executeTest("test minIndelFraction 1.0", spec); + } }