From 34ea443cdbb071c435e22597171f6e6d663f63fa Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Mar 2012 16:28:02 -0400 Subject: [PATCH] Better algorithm for choosing which indel alleles are present in samples -- The previous approach (requiring > 5 copies among all reads) is breaking down in many samples (>1000) just from sequencing errors. -- This breakdown is producing spurious clustered indels (lots of these!) around real common indels -- The new approach requires >X% of reads in a sample to carry an indel of any type (no allele matching) to be including in the counting towards 5. This actually makes sense in that if you have enough data we expect most reads to have the indel, but the allele might be wrong because of alignment, etc. If you have very few reads, then the threshold is crossed with any indel containing read, and it's counted. -- As far as I can tell this is the right thing to do in general. We'll make another call set in ESP and see how it works at scale. -- Added integration tests to ensure that the system is behaving as I expect on the site I developed the code on from ESP --- .../genotyper/ConsensusAlleleCounter.java | 286 ++++++++++++++++++ ...elGenotypeLikelihoodsCalculationModel.java | 202 +------------ .../genotyper/UnifiedArgumentCollection.java | 12 + .../UnifiedGenotyperIntegrationTest.java | 39 ++- 4 files changed, 341 insertions(+), 198 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java new file mode 100644 index 000000000..3f03c2bb2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.*; + +/** + * Code for determining which indels are segregating among the samples. + * + * This code is just a refactor of the original code from Guillermo in the UG. + * + * @author Mark DePristo + * @since 3/26/12 + */ +public class ConsensusAlleleCounter { + final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); + private final int minIndelCountForGenotyping; + private final boolean doMultiAllelicCalls; + private final double minFractionInOneSample; + private final GenomeLocParser locParser; + + public ConsensusAlleleCounter(final GenomeLocParser locParser, + final boolean doMultiAllelicCalls, + final int minIndelCountForGenotyping, + final double minFractionInOneSample) { + this.minIndelCountForGenotyping = minIndelCountForGenotyping; + this.doMultiAllelicCalls = doMultiAllelicCalls; + this.minFractionInOneSample = minFractionInOneSample; + this.locParser = locParser; + } + + /** + * Returns a list of Alleles at this locus that may be segregating + * + * @param ref + * @param contexts + * @param contextType + * @return + */ + public List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); +// logger.info("Alleles at " + ref.getLocus()); +// for ( Map.Entry elt : consensusIndelStrings.entrySet() ) { +// logger.info(" " + elt.getValue() + " => " + elt.getKey()); +// } + return consensusCountsToAlleles(ref, consensusIndelStrings); + } + + // + // TODO -- WARNING DOESN'T WORK WITH REDUCED READS + // + private Map countConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final GenomeLoc loc = ref.getLocus(); + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for (Map.Entry sample : contexts.entrySet()) { + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + insCount += indelPileup.getNumberOfInsertions(); + delCount += indelPileup.getNumberOfDeletions(); + } + + if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) + return Collections.emptyMap(); + + for (Map.Entry sample : contexts.entrySet()) { + // todo -- warning, can be duplicating expensive partition here + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + + final int nIndelReads = indelPileup.getNumberOfInsertions() + indelPileup.getNumberOfDeletions(); + final int nReadsOverall = indelPileup.getNumberOfElements(); + if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample) { +// if ( nIndelReads > 0 ) +// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + continue; +// } else { +// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + } + + for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read == null) + continue; + if (ReadUtils.is454Read(read)) { + continue; + } + +/* if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + */ + String indelString = p.getEventBases(); + if (p.isInsertion()) { + boolean foundKey = false; + // copy of hashmap into temp arrayList + ArrayList> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map + if (s.startsWith(indelString)) { + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + if (s.endsWith(indelString)) { + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + } + + return consensusIndelStrings; + } + + private List consensusCountsToAlleles(final ReferenceContext ref, + final Map consensusIndelStrings) { + final GenomeLoc loc = ref.getLocus(); + final Collection vcs = new ArrayList(); + int maxAlleleCnt = 0; + Allele refAllele, altAllele; + + for (final Map.Entry elt : consensusIndelStrings.entrySet()) { + final String s = elt.getKey(); + final int curCnt = elt.getValue(); + int stop = 0; + + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + final int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else continue; // don't go on with this allele if refBases are non-standard + } else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } + else continue; // go on to next allele if consensus insertion has any non-standard base. + } + + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(Arrays.asList(refAllele, altAllele)); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } + + } + } + + if (vcs.isEmpty()) + return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion + + final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + return mergedVC.getAlleles(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 21f11d2ff..00d90e3f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -52,11 +52,9 @@ import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final int HAPLOTYPE_SIZE; - private final int minIndelCountForGenotyping; private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; @@ -72,7 +70,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup private GenomeLoc lastSiteVisited; - private ArrayList alleleList; + private List alleleList = new ArrayList(); static { indelLikelihoodMap.set(new HashMap>()); @@ -83,205 +81,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - - private ArrayList computeConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { - Allele refAllele = null, altAllele = null; - GenomeLoc loc = ref.getLocus(); - ArrayList aList = new ArrayList(); - - HashMap consensusIndelStrings = new HashMap(); - - int insCount = 0, delCount = 0; - // quick check of total number of indels in pileup - for (Map.Entry sample : contexts.entrySet()) { - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - insCount += indelPileup.getNumberOfInsertions(); - delCount += indelPileup.getNumberOfDeletions(); - } - - if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) - return aList; - - for (Map.Entry sample : contexts.entrySet()) { - // todo -- warning, can be duplicating expensive partition here - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - - - for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { - //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) - continue; - if (ReadUtils.is454Read(read)) { - continue; - } - -/* if (DEBUG && p.isIndel()) { - System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", - read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), - p.getEventLength(),p.getType().toString(), p.getEventBases()); - } - */ - - String indelString = p.getEventBases(); - if (p.isInsertion()) { - boolean foundKey = false; - // copy of hashmap into temp arrayList - ArrayList> cList = new ArrayList>(); - for (String s : consensusIndelStrings.keySet()) { - cList.add(new Pair(s,consensusIndelStrings.get(s))); - } - - if (read.getAlignmentEnd() == loc.getStart()) { - // first corner condition: a read has an insertion at the end, and we're right at the insertion. - // In this case, the read could have any of the inserted bases and we need to build a consensus - - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - // case 1: current insertion is prefix of indel in hash map - if (s.startsWith(indelString)) { - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.startsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - } - else if (read.getAlignmentStart() == loc.getStart()+1) { - // opposite corner condition: read will start at current locus with an insertion - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - if (s.endsWith(indelString)) { - // case 1: current insertion (indelString) is suffix of indel in hash map (s) - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - - } - else { - // normal case: insertion somewhere in the middle of a read: add count to arrayList - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - cList.add(new Pair(indelString,cnt+1)); - } - - // copy back arrayList into hashMap - consensusIndelStrings.clear(); - for (Pair pair : cList) { - consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); - } - - } - else if (p.isDeletion()) { - indelString = String.format("D%d",p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); - - } - } - - } - - Collection vcs = new ArrayList(); - int maxAlleleCnt = 0; - String bestAltAllele = ""; - - for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s), stop = 0; - // if observed count if above minimum threshold, we will genotype this allele - if (curCnt < minIndelCountForGenotyping) - continue; - - if (s.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(s.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); - stop = loc.getStart() + dLen; - byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); - - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases, true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); - } - else continue; // don't go on with this allele if refBases are non-standard - } else { - // insertion case - if (Allele.acceptableAlleleBases(s)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(s, false); - stop = loc.getStart(); - } - else continue; // go on to next allele if consensus insertion has any non-standard base. - } - - - ArrayList vcAlleles = new ArrayList(); - vcAlleles.add(refAllele); - vcAlleles.add(altAllele); - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(vcAlleles); - builder.referenceBaseForIndel(ref.getBase()); - builder.noGenotypes(); - if (doMultiAllelicCalls) - vcs.add(builder.make()); - else { - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - - } - } - - if (vcs.isEmpty()) - return aList; // nothing else to do, no alleles passed minimum count criterion - - VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); - - aList = new ArrayList(mergedVC.getAlleles()); - - return aList; - + private List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenomeLocParser locParser) { + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + return counter.computeConsensusAlleles(ref, contexts, contextType); } private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 82e411c25..7b8e5c897 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -118,6 +118,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute + * to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end + * up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for + * low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily over + * come this threshold. + */ + @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) + public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; + + /** * This argument informs the prior probability of having an indel at a site. */ @@ -165,6 +176,7 @@ public class UnifiedArgumentCollection { uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; + uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b3bd0253c..67cd40eea 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -17,8 +17,8 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); + Arrays.asList("849ee8b21b4bbb02dfc7867a4f1bc14b")); executeTest("test Multiple SNP alleles", spec); } @@ -335,4 +335,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { UserException.class); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing SnpEff + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("f08ff07ad49d388198c1887baad05977")); + executeTest("test minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("a0945fd21369aaf68c7f1d96dbb930d1")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a")); + executeTest("test minIndelFraction 1.0", spec); + } }