diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java old mode 100755 new mode 100644 index 763401fac..878e2e317 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2010, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,15 +12,14 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.annotator; @@ -34,20 +33,16 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.*; import java.util.*; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; -// todo -- rename to haplotype penalty public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { private final static boolean DEBUG = false; private final static int MIN_CONTEXT_WING_SIZE = 10; - - // if true, we compute a second haplotype from the reads, instead of constraining ourselves to the reference - // as a haplotype itself - private final static boolean USE_NON_REFERENCE_SECOND_HAPLOTYPE = true; + private final static String REGEXP_WILDCARD = "."; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( !vc.isBiallelic() || !vc.isSNP() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here @@ -58,13 +53,14 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); int contextSize = contextWingSize * 2 + 1; - // calculate - Haplotype refHaplotype = calcRefHaplotype(vc, ref, context, contextSize); - Haplotype altHaplotype = new Haplotype(getPileupOfAllele(vc.getAlternateAllele(0), context.getBasePileup()), contextSize); + // Compute all haplotypes consistent with the current read pileup + List haplotypes = computeHaplotypes(context.getBasePileup(), contextSize); - //System.exit(1); // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes - double score = scoreReadsAgainstHaplotypes(Arrays.asList(refHaplotype, altHaplotype), context.getBasePileup(), contextSize); + double score = 0.0; + + if (haplotypes != null) + score = scoreReadsAgainstHaplotypes(haplotypes, context.getBasePileup(), contextSize); // return the score Map map = new HashMap(); @@ -72,37 +68,184 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { return map; } - // todo -- note that the refPileup.size() won't deal correctly with the situation where the current site is hom-var - // todo -- but there's nearby het size. In order to really handle this we need to group reads into two clusters, - // todo -- but if we are going to do this we might as well just assemble the whole region - public Haplotype calcRefHaplotype(VariantContext vc, ReferenceContext ref, AlignmentContext context, int contextSize) { - ReadBackedPileup refPileup = getPileupOfAllele(vc.getReference(), context.getBasePileup()); - if ( USE_NON_REFERENCE_SECOND_HAPLOTYPE && refPileup.size() > 0 ) { - // we are calculating the reference haplotype from the reads itself -- effectively allows us to - // have het haplotypes that are hom-var in the surrounding context, indicating that the individual - // as two alt haplotypes - return new Haplotype(refPileup, contextSize); - } else { - // we are constraining the reference haplotype to really be the reference itself - int contextWingSize = (contextSize - 1) / 2; - int refMiddle = (int)(ref.getWindow().size() - 1) / 2; - int refStart = refMiddle - contextWingSize; - int refStop = refMiddle + contextWingSize + 1; - String refString = new String(ref.getBases()).substring(refStart, refStop); - return new Haplotype(refString.getBytes(), 60); + private class HaplotypeComparator implements Comparator{ + + public int compare(Haplotype a, Haplotype b) { + if (a.getQualitySum() < b.getQualitySum()) + return 1; + if (a.getQualitySum() > b.getQualitySum()){ + return -1; + } + return 0; } } + + private List computeHaplotypes(ReadBackedPileup pileup, int contextSize) { + // Compute all possible haplotypes consistent with current pileup + ArrayList haplotypeList = new ArrayList(); + PriorityQueue haplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); + + + for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) { + if (ReadUtils.is454Read(p.getRead())) + continue; + Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize); + + + haplotypeQueue.add(haplotypeFromRead); + //haplotypeList.add(haplotypeFromRead); + } + + // Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes + Haplotype elem; + while ((elem = haplotypeQueue.poll()) != null) { + //System.out.print("element: "+elem.toString()); + //System.out.format(" SumQual = %f\n", elem.getQualitySum()); + boolean foundHaplotypeMatch = false; + //Haplotype[] remainingHaplotypes = haplotypeQueue.toArray(new Haplotype[haplotypeQueue.size()]); + for ( Haplotype haplotypeFromList : haplotypeList ) { + + Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); + //System.out.format("-Checking consensus for %s:", haplotypeFromList.toString()); + if (consensusHaplotype != null) { + //System.out.format("--Consensus haplotype = %s, qual = %f\n", consensusHaplotype.toString(), consensusHaplotype.getQualitySum()); + foundHaplotypeMatch = true; + if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) { + haplotypeList.remove(haplotypeFromList); + haplotypeList.add(consensusHaplotype); + } + break; + } + /* else { + System.out.println("no consensus found"); + } + */ + } + + if (!foundHaplotypeMatch) { + haplotypeList.add(elem); + } + } + // Now retrieve two most popular haplotypes + // TODO - quick and dirty solution, could use better data structures to do this automatically + int bestIdx=0, secondBestIdx=0; + double bestIdxVal=-1.0, secondBestIdxVal = -1.0; + + for (int k=0; k < haplotypeList.size(); k++) { + + double qualSum = haplotypeList.get(k).getQualitySum(); + if (qualSum >= bestIdxVal) { + secondBestIdx = bestIdx; + secondBestIdxVal = bestIdxVal; + bestIdx = k; + bestIdxVal = qualSum; + } + else if (qualSum >= secondBestIdxVal) { + // check if current is second best + secondBestIdx = k; + secondBestIdxVal = qualSum; + } + } + if (haplotypeList.size() > 0) { + Haplotype haplotypeR = haplotypeList.get(bestIdx); + Haplotype haplotypeA = haplotypeList.get(secondBestIdx); + + // Temp hack to match old implementation's scaling, TBD better behavior + + return Arrays.asList(new Haplotype(haplotypeR.bases, 60), new Haplotype(haplotypeA.bases, contextSize)); + } + else + return null; + } + + private Haplotype getHaplotypeFromRead(ExtendedPileupElement p, int contextSize) { + SAMRecord read = p.getRead(); + int readOffsetFromPileup = p.getOffset(); + int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; + byte[] haplotypeBases = new byte[contextSize]; + + for(int i=0; i < contextSize; i++) { + haplotypeBases[i] = REGEXP_WILDCARD.getBytes()[0]; + } + + double[] baseQualities = new double[contextSize]; + Arrays.fill(baseQualities,0.0); + + for (int i = 0; i < contextSize; i++ ) { + int baseOffset = i + baseOffsetStart; + if ( baseOffset < 0 ) + continue; + if ( baseOffset >= read.getReadLength() ) + break; + + haplotypeBases[i] = read.getReadBases()[baseOffset]; + baseQualities[i] = (double)read.getBaseQualities()[baseOffset]; + } + + + return new Haplotype(haplotypeBases, baseQualities); + + } + + + + private Haplotype getConsensusHaplotype(Haplotype haplotypeA, Haplotype haplotypeB) { + String a = haplotypeA.toString(); + String b = haplotypeB.toString(); + + if (a.length() != b.length()) + throw new StingException("Haplotypes a and b must be of same length"); + + char chA, chB; + char wc = REGEXP_WILDCARD.charAt(0); + + + char[] consensusChars = new char[a.length()]; + double[] consensusQuals = new double[a.length()]; + + for (int i=0; i < a.length(); i++) { + chA = a.charAt(i); + chB = b.charAt(i); + + if ((chA != chB) && (chA != wc) && (chB != wc)) + return null; + + if ((chA == wc) && (chB == wc)) { + consensusChars[i] = wc; + consensusQuals[i] = 0.0; + } + else if ((chA == wc)) { + consensusChars[i] = chB; + consensusQuals[i] = haplotypeB.quals[i]; + } + else if ((chB == wc)){ + consensusChars[i] = chA; + consensusQuals[i] = haplotypeA.quals[i]; + } else { + consensusChars[i] = chA; + consensusQuals[i] = haplotypeA.quals[i]+haplotypeB.quals[i]; + } + + + } + + + return new Haplotype(new String(consensusChars), consensusQuals); + } // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes private double scoreReadsAgainstHaplotypes(List haplotypes, ReadBackedPileup pileup, int contextSize) { - if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0)); - if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(1)); +// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0)); +// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(1)); double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()]; for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) { SAMRecord read = p.getRead(); int readOffsetFromPileup = p.getOffset(); + if (ReadUtils.is454Read(read)) + continue; + if ( DEBUG ) System.out.printf("--------------------------------------------- Read %s%n", read.getReadName()); double m = 10000000; for ( int i = 0; i < haplotypes.size(); i++ ) { @@ -172,114 +315,40 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { private class Haplotype { byte[] bases = null; - byte[] quals = null; + double[] quals = null; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * - * @param bases - * @param qual + * @param bases bases + * @param qual qual */ Haplotype(byte[] bases, int qual) { this.bases = bases; - quals = new byte[bases.length]; - Arrays.fill(quals, (byte)qual); + quals = new double[bases.length]; + Arrays.fill(quals, (double)qual); } - Haplotype(ReadBackedPileup pileup, int contextSize ) { - this.bases = new byte[contextSize]; - this.quals = new byte[contextSize]; - calculateConsensusOverWindow(pileup, contextSize, (contextSize - 1) / 2); + Haplotype(byte[] bases, double[] quals) { + this.bases = bases; + this.quals = quals; } - private void calculateConsensusOverWindow(ReadBackedPileup pileup, int contextSize, int pileupOffset) { - // for each context position - for ( int i = 0; i < contextSize; i++ ) { - int offsetFromPileup = i - pileupOffset; - ReadBackedPileup offsetPileup = pileupAtOffset(pileup, offsetFromPileup); - if ( DEBUG ) System.out.printf("pileup is %s%n", offsetPileup); - BaseQual bq = calcConsensusAtLocus(offsetPileup, FLAT_BASE_PRIORS); - this.bases[i] = bq.getBase(); - this.quals[i] = bq.getQual(); - if ( DEBUG ) System.out.printf(" At %d: offset %d bq = %c / %d%n", i, offsetFromPileup, (char)bq.getBase(), bq.getQual()); - } + Haplotype(String bases, double[] quals) { + this.bases = bases.getBytes(); + this.quals = quals; } - - private BaseQual calcConsensusAtLocus( ReadBackedPileup pileup, double[] log10priors ) { - double[] log10BaseLikelihoods = new double[BaseUtils.Base.values().length]; - - // loop over a, c, g, t and determine the most likely hypothesis - for ( BaseUtils.Base base : BaseUtils.Base.values() ) { - double log10L = log10priors[base.getIndex()]; - - for ( PileupElement p : pileup ) { - byte qual = p.getQual(); - if ( qual > 5 ) { - double baseP = QualityUtils.qualToProb(qual); - double L = base.sameBase(p.getBase()) ? baseP : 1 - baseP; - if ( Double.isInfinite(Math.log10(L)) ) - throw new StingException("BUG -- base likelihood is infinity!"); - log10L += Math.log10(L); - } - } - - log10BaseLikelihoods[base.getIndex()] = log10L; - } - - double[] posteriors = MathUtils.normalizeFromLog10(log10BaseLikelihoods, false); - int mostLikelyIndex = MathUtils.maxElementIndex(posteriors); - byte mostLikelyBase = BaseUtils.Base.values()[mostLikelyIndex].getBase(); // get the most likely option - double MAX_CONSENSUS_QUALITY = 0.000001; - byte qual = QualityUtils.probToQual(posteriors[mostLikelyIndex],MAX_CONSENSUS_QUALITY); // call posterior calculator here over L over bases - return new BaseQual(mostLikelyBase, qual); - } - public String toString() { return new String(this.bases); } - } - private static class BaseQual extends Pair { - public BaseQual(byte base, byte qual) { - super(base, qual); - } - - public byte getBase() { return getFirst(); } - public byte getQual() { return getSecond(); } - } - - private static ReadBackedPileup pileupAtOffset(ReadBackedPileup pileup, int offsetFromPileup) { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - // go through the pileup, read by read, collecting up the context at offset - for ( PileupElement p : pileup ) { - // not safe -- doesn't work for indel-containing reads! - - // whole algorithm should be restructured to handle other reads in the window or use LocusIteratorByState - SAMRecord read = p.getRead(); - int readOffsetInPileup = p.getOffset(); - int neededReadOffset = readOffsetInPileup + offsetFromPileup; - if ( neededReadOffset >= 0 && neededReadOffset < read.getReadLength() ) { - reads.add(p.getRead()); - offsets.add(neededReadOffset); + double getQualitySum() { + double s = 0; + for (int k=0; k < bases.length; k++) { + s += quals[k]; } + return s; } - - return new ReadBackedPileupImpl(pileup.getLocation(), reads, offsets); - } - - private static ReadBackedPileup getPileupOfAllele( Allele allele, ReadBackedPileup pileup ) { - ArrayList filteredPileup = new ArrayList(); - byte alleleBase = allele.getBases()[0]; // assumes SNP - - for ( PileupElement p : pileup ) { - if ( p.getBase() == alleleBase ) { - filteredPileup.add(p); - } - } - - return new ReadBackedPileupImpl(pileup.getLocation(), filteredPileup); } public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with two (and only two) segregating haplotypes")); } -} \ No newline at end of file +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MyHaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MyHaplotypeScore.java deleted file mode 100644 index 326c9cd2a..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MyHaplotypeScore.java +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broad.tribble.vcf.VCFHeaderLineType; -import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.*; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.pileup.*; - -import java.util.*; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -public class MyHaplotypeScore implements InfoFieldAnnotation { - private final static boolean DEBUG = false; - private final static int MIN_CONTEXT_WING_SIZE = 10; - private final static String REGEXP_WILDCARD = "."; - - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( !vc.isBiallelic() || !vc.isSNP() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here - return null; - - AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values()); - - int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); - int contextSize = contextWingSize * 2 + 1; - - // Compute all haplotypes consistent with the current read pileup - List haplotypes = computeHaplotypes(context.getBasePileup(), contextSize); - - // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes - double score = 0.0; - - if (haplotypes != null) - score = scoreReadsAgainstHaplotypes(haplotypes, context.getBasePileup(), contextSize); - - // return the score - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", score)); - return map; - } - - private class HaplotypeComparator implements Comparator{ - - public int compare(Haplotype a, Haplotype b) { - if (a.getQualitySum() < b.getQualitySum()) - return 1; - if (a.getQualitySum() > b.getQualitySum()){ - return -1; - } - return 0; - } - } - - - private List computeHaplotypes(ReadBackedPileup pileup, int contextSize) { - // Compute all possible haplotypes consistent with current pileup - ArrayList haplotypeList = new ArrayList(); - ArrayList readsPerHaplotype = new ArrayList(); - PriorityQueue haplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); - - - for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) { - if (ReadUtils.is454Read(p.getRead())) - continue; - Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize); - - - haplotypeQueue.add(haplotypeFromRead); - //haplotypeList.add(haplotypeFromRead); - } - - // Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes - Haplotype elem; - while ((elem = haplotypeQueue.poll()) != null) { - //System.out.print("element: "+elem.toString()); - //System.out.format(" SumQual = %f\n", elem.getQualitySum()); - boolean foundHaplotypeMatch = false; - //Haplotype[] remainingHaplotypes = haplotypeQueue.toArray(new Haplotype[haplotypeQueue.size()]); - for ( Haplotype haplotypeFromList : haplotypeList ) { - - Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); - //System.out.format("-Checking consensus for %s:", haplotypeFromList.toString()); - if (consensusHaplotype != null) { - //System.out.format("--Consensus haplotype = %s, qual = %f\n", consensusHaplotype.toString(), consensusHaplotype.getQualitySum()); - foundHaplotypeMatch = true; - if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) { - haplotypeList.remove(haplotypeFromList); - haplotypeList.add(consensusHaplotype); - } - break; - } - /* else { - System.out.println("no consensus found"); - } - */ - } - - if (!foundHaplotypeMatch) { - haplotypeList.add(elem); - } - } - // Now retrieve two most popular haplotypes - // TODO - quick and dirty solution, could use better data structures to do this automatically - int bestIdx=0, secondBestIdx=0; - double bestIdxVal=-1.0, secondBestIdxVal = -1.0; - - for (int k=0; k < haplotypeList.size(); k++) { - - double qualSum = haplotypeList.get(k).getQualitySum(); - if (qualSum >= bestIdxVal) { - secondBestIdx = bestIdx; - secondBestIdxVal = bestIdxVal; - bestIdx = k; - bestIdxVal = qualSum; - } - else if (qualSum >= secondBestIdxVal) { - // check if current is second best - secondBestIdx = k; - secondBestIdxVal = qualSum; - } - } - if (haplotypeList.size() > 0) { - Haplotype haplotypeR = haplotypeList.get(bestIdx); - Haplotype haplotypeA = haplotypeList.get(secondBestIdx); - - // Temp hack to match old implementation's scaling, TBD better behavior - - return Arrays.asList(new Haplotype(haplotypeR.bases, 60), new Haplotype(haplotypeA.bases, contextSize)); - } - else - return null; - } - - private Haplotype getHaplotypeFromRead(ExtendedPileupElement p, int contextSize) { - SAMRecord read = p.getRead(); - int readOffsetFromPileup = p.getOffset(); - int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; - byte[] haplotypeBases = new byte[contextSize]; - - for(int i=0; i < contextSize; i++) { - haplotypeBases[i] = REGEXP_WILDCARD.getBytes()[0]; - } - - double[] baseQualities = new double[contextSize]; - Arrays.fill(baseQualities,0.0); - - for (int i = 0; i < contextSize; i++ ) { - int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) - continue; - if ( baseOffset >= read.getReadLength() ) - break; - - haplotypeBases[i] = read.getReadBases()[baseOffset]; - baseQualities[i] = (double)read.getBaseQualities()[baseOffset]; - } - - - return new Haplotype(haplotypeBases, baseQualities); - - } - - - - private Haplotype getConsensusHaplotype(Haplotype haplotypeA, Haplotype haplotypeB) { - String a = haplotypeA.toString(); - String b = haplotypeB.toString(); - - if (a.length() != b.length()) - throw new StingException("Haplotypes a and b must be of same length"); - - char chA, chB; - char wc = REGEXP_WILDCARD.charAt(0); - - - char[] consensusChars = new char[a.length()]; - double[] consensusQuals = new double[a.length()]; - - for (int i=0; i < a.length(); i++) { - chA = a.charAt(i); - chB = b.charAt(i); - - if ((chA != chB) && (chA != wc) && (chB != wc)) - return null; - - if ((chA == wc) && (chB == wc)) { - consensusChars[i] = wc; - consensusQuals[i] = 0.0; - } - else if ((chA == wc) && (chB != wc)) { - consensusChars[i] = chB; - consensusQuals[i] = haplotypeB.quals[i]; - } - else if ((chA != wc) && (chB == wc)){ - consensusChars[i] = chA; - consensusQuals[i] = haplotypeA.quals[i]; - } else { - consensusChars[i] = chA; - consensusQuals[i] = haplotypeA.quals[i]+haplotypeB.quals[i]; - } - - - } - - - return new Haplotype(new String(consensusChars), consensusQuals); - } - // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes - private double scoreReadsAgainstHaplotypes(List haplotypes, ReadBackedPileup pileup, int contextSize) { -// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0)); -// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(1)); - - double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()]; - for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) { - SAMRecord read = p.getRead(); - int readOffsetFromPileup = p.getOffset(); - - if (ReadUtils.is454Read(read)) - continue; - - if ( DEBUG ) System.out.printf("--------------------------------------------- Read %s%n", read.getReadName()); - double m = 10000000; - for ( int i = 0; i < haplotypes.size(); i++ ) { - Haplotype haplotype = haplotypes.get(i); - int start = readOffsetFromPileup - (contextSize - 1)/2; - double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype); - haplotypeScores[p.getPileupOffset()][i] = score; - if ( DEBUG ) System.out.printf(" vs. haplotype %d = %f%n", i, score); - m = Math.min(score, m); - } - if ( DEBUG ) System.out.printf(" => best score was %f%n", m); - } - - double overallScore = 0.0; - for ( double[] readHaplotypeScores : haplotypeScores ) { - overallScore += MathUtils.arrayMin(readHaplotypeScores); - } - - return overallScore; - } - - private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Haplotype haplotype ) { - double expected = 0.0; - double mismatches = 0.0; - - // What's the expected mismatch rate under the model that this read is actually sampled from - // this haplotype? Let's assume the consensus base c is a random choice one of A, C, G, or T, and that - // the observed base is actually from a c with an error rate e. Since e is the rate at which we'd - // see a miscalled c, the expected mismatch rate is really e. So the expected number of mismatches - // is just sum_i e_i for i from 1..n for n sites - // - // Now, what's the probabilistic sum of mismatches? Suppose that the base b is equal to c. Well, it could - // actually be a miscall in a matching direction, which would happen at a e / 3 rate. If b != c, then - // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch. - // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n - for ( int i = 0; i < contextSize; i++ ) { - int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) - continue; - if ( baseOffset >= read.getReadLength() ) - break; - - byte haplotypeBase = haplotype.bases[i]; - byte readBase = read.getReadBases()[baseOffset]; - - boolean matched = BaseUtils.basesAreEqual(readBase, haplotypeBase ); - double e = QualityUtils.qualToErrorProb(read.getBaseQualities()[baseOffset]); - expected += e; - mismatches += matched ? e : 1 - e / 3; - - // a more sophisticated calculation would include the reference quality, but it's nice to actually penalize - // the mismatching of poorly determined regions of the consensus - - if ( DEBUG ) System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n", - read.getReadName(), (char)haplotypeBase, (char)readBase, e, read.getBaseQualities()[baseOffset], expected, mismatches); - } - - return mismatches - expected; - } - - - private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length]; - static { - for ( int i = 0; i < BaseUtils.Base.values().length; i++ ) - FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length); - } - - private class Haplotype { - byte[] bases = null; - double[] quals = null; - - /** - * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual - * - * @param bases - * @param qual - */ - Haplotype(byte[] bases, int qual) { - this.bases = bases; - quals = new double[bases.length]; - Arrays.fill(quals, (double)qual); - } - - Haplotype(byte[] bases, double[] quals) { - this.bases = bases; - this.quals = quals; - } - - Haplotype(String bases, double[] quals) { - this.bases = bases.getBytes(); - this.quals = quals; - } - public String toString() { return new String(this.bases); } - - double getQualitySum() { - double s = 0; - for (int k=0; k < bases.length; k++) { - s += quals[k]; - } - return s; - } - } - - public List getKeyNames() { return Arrays.asList("MyHaplotypeScore"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MyHaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with two (and only two) segregating haplotypes")); } -} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 922dc7c9a..5d515422b 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -31,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("184fc1f99dfba3e2519d16458d728fcc")); + Arrays.asList("674541eb78fa6e4f9bee172b3f34bbab")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -39,7 +39,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("12a069d5f1c9cd3970ea6301397219aa")); + Arrays.asList("985c55e3f0c41082dc56f7a291ef307a")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -63,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B variant,VCF," + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("8f42df642b329ff19bc2c39470117280")); + Arrays.asList("c789e8f795cf4b182f717423bf3328f2")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -71,7 +71,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B variant,VCF," + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("908aa4b6ac65bee57f91bc6fed4d46ad")); + Arrays.asList("be18a3b9589ea60350fbaf8f7e1dd769")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 2e4bb0529..8d6a402f2 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -35,7 +35,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("3a402233264e21a84d421e3a4ea64768")); + Arrays.asList("0c1a5f6372f6d17544a95292bd313c86")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList("79736b3e955a16b30f827b2786fc08b1")); + Arrays.asList("049c98bc6db7029995f8893f570dd9ad")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("b8d93c6fcb4b17d454cdcbfc4b43f076")); + Arrays.asList("6c278b09727cd14523423a6beda627a5")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); } @@ -63,7 +63,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testParallelization() { - String md5 = "098802639cfab1b777c96d38376f118a"; + String md5 = "c66c55363fb9e951569667e3ff6eb728"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,075,000", 1, @@ -90,11 +90,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "b87f28a772eb75c8dad9062c6d039da5" ); - e.put( "-all_bases", "ec9de2044cd5f5901d6879b56f12993a" ); - e.put( "--min_base_quality_score 26", "875c64a64fd402626e04c9540388c483" ); - e.put( "--min_mapping_quality_score 26", "de7d90e425f8f08f219dc91a25d60c68" ); - e.put( "--max_mismatches_in_40bp_window 5", "758e312b2a2e7c4d83f60174d43fac8a" ); + e.put( "-genotype", "dfe1a220ee0966a20af4a2fca9e635fe" ); + e.put( "-all_bases", "e7b766485dd37631b3fcf9f655588456" ); + e.put( "--min_base_quality_score 26", "e4e318853e091e63166eaef648ec26ac" ); + e.put( "--min_mapping_quality_score 26", "57fffb32a3a7a736f6c30ae08ef71c91" ); + e.put( "--max_mismatches_in_40bp_window 5", "4f7a5c22247234c4ee3ca2cfbbc16729" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -108,12 +108,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("4937bab94b0bae1aa61cdf3a06cb49e8")); + Arrays.asList("8b4ebcbf330340a437e572a73a99227b")); executeTest("testConfidence1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("f8b722dad5c4868a4bba246eef83f96d")); + Arrays.asList("0d8825a98a8918bed2c02ac44e15dde7")); executeTest("testConfidence2", spec2); }