diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index c76082e3e..3eb40685f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -9,8 +9,7 @@ import java.util.List; import java.util.Arrays; -public class BaseQualityRankSumTest /*extends RankSumTest*/ { - // todo -- seems math in this test is dubious, need to recheck and verify (p-values wildly divergent from R or MATLAB) +public class BaseQualityRankSumTest extends RankSumTest { public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Phred-scaled p-value From Wilcoxon Rank Sum Test of Het Vs. Ref Base Qualities")); } @@ -21,10 +20,11 @@ public class BaseQualityRankSumTest /*extends RankSumTest*/ { if ( p.isDeletion() ) continue; - if ( p.getBase() == ref ) + if ( p.getBase() == ref ) { refQuals.add((int)p.getQual()); - else if ( (char)p.getBase() == alt ) + } else if ( (char)p.getBase() == alt ) { altQuals.add((int)p.getQual()); + } } } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 36093f129..ad2bcdbbb 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; @@ -39,49 +40,51 @@ import org.broadinstitute.sting.utils.pileup.*; import java.util.*; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { private final static boolean DEBUG = false; - private final static int MIN_CONTEXT_WING_SIZE = 10; - private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 20; + private final static int MIN_CONTEXT_WING_SIZE = 20; + private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; private final static char REGEXP_WILDCARD = '.'; public boolean useRead(PileupElement p) { - return ! ReadUtils.is454Read(p.getRead()); - // TODO -- warning, min. mapping quality fixed to constant 1 here for indels. How can I determine the min. mapping quality? - // if ( ReadUtils.is454Read(p.getRead()) ) // we're not a 454 read - // return false; - // else if ( p.getOffset() == -1 ) // indel - // return p.getRead().getMappingQuality() > 0; - // else if ( ! (p.getRead() instanceof GATKSAMRecord) ) // we're not a GATKSamRecord, so we can't filter - // return true; - // else - // return ((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset()); // we used the base to call the variant + //return true; // Use all reads + //return !ReadUtils.is454Read(p.getRead()); // Use all non-454 reads + return p.getOffset() == -1 || ((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset()); // Use all reads from the filtered context } public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( !vc.isBiallelic() || !vc.isSNP() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here return null; + + final AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values()); - AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values()); + final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); + final int contextSize = contextWingSize * 2 + 1; - int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); - int contextSize = contextWingSize * 2 + 1; + final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2; // Compute all haplotypes consistent with the current read pileup - List haplotypes = computeHaplotypes(context.getBasePileup(), contextSize); + final List haplotypes = computeHaplotypes(context.getBasePileup(), contextSize, locus); - // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes - double score = 0.0; + final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); + if (haplotypes != null) { + final Set> genotypes = vc.getGenotypes().entrySet(); + for ( final Map.Entry genotype : genotypes ) { + final StratifiedAlignmentContext thisContext = stratifiedContexts.get(genotype.getKey()); + if ( thisContext != null ) { + double thisScore = scoreReadsAgainstHaplotypes(haplotypes, thisContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), contextSize, locus); + scoreRA.add(thisScore); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + } + } + } - if (haplotypes != null) - score = scoreReadsAgainstHaplotypes(haplotypes, context.getBasePileup(), contextSize); - - // return the score - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", score)); + // annotate the score in the info field + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean())); return map; } @@ -97,102 +100,89 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - private List computeHaplotypes(ReadBackedPileup pileup, int contextSize) { + private List computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus) { // Compute all possible haplotypes consistent with current pileup - ArrayList haplotypeList = new ArrayList(); - PriorityQueue haplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); + final PriorityQueue candidateHaplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); + final PriorityQueue consensusHaplotypeQueue = new PriorityQueue(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); - for ( PileupElement p : pileup ) { + for ( final PileupElement p : pileup ) { if ( useRead(p) ) { - Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize); - haplotypeQueue.add(haplotypeFromRead); + final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus); + candidateHaplotypeQueue.add(haplotypeFromRead); } } // Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes Haplotype elem; - while ((elem = haplotypeQueue.poll()) != null) { - //System.out.print("element: "+elem.toString()); - //System.out.format(" SumQual = %f\n", elem.getQualitySum()); + while ((elem = candidateHaplotypeQueue.poll()) != null) { boolean foundHaplotypeMatch = false; - //Haplotype[] remainingHaplotypes = haplotypeQueue.toArray(new Haplotype[haplotypeQueue.size()]); - for ( Haplotype haplotypeFromList : haplotypeList ) { - - Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); - //System.out.format("-Checking consensus for %s:", haplotypeFromList.toString()); + Haplotype lastCheckedHaplotype = null; + for ( final Haplotype haplotypeFromList : consensusHaplotypeQueue ) { + final Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); if (consensusHaplotype != null) { - //System.out.format("--Consensus haplotype = %s, qual = %f\n", consensusHaplotype.toString(), consensusHaplotype.getQualitySum()); foundHaplotypeMatch = true; if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) { - haplotypeList.remove(haplotypeFromList); - haplotypeList.add(consensusHaplotype); + consensusHaplotypeQueue.remove(haplotypeFromList); + consensusHaplotypeQueue.add(consensusHaplotype); } break; } - /* else { - System.out.println("no consensus found"); + else { + lastCheckedHaplotype = haplotypeFromList; } - */ } - if (!foundHaplotypeMatch && haplotypeList.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) { - haplotypeList.add(elem); + if (!foundHaplotypeMatch && consensusHaplotypeQueue.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) { + consensusHaplotypeQueue.add(elem); + } else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum() ) { + consensusHaplotypeQueue.remove(lastCheckedHaplotype); + consensusHaplotypeQueue.add(elem); } } - // Now retrieve two most popular haplotypes - // TODO - quick and dirty solution, could use better data structures to do this automatically - int bestIdx=0, secondBestIdx=0; - double bestIdxVal=-1.0, secondBestIdxVal = -1.0; - for (int k=0; k < haplotypeList.size(); k++) { - - double qualSum = haplotypeList.get(k).getQualitySum(); - if (qualSum >= bestIdxVal) { - secondBestIdx = bestIdx; - secondBestIdxVal = bestIdxVal; - bestIdx = k; - bestIdxVal = qualSum; - } - else if (qualSum >= secondBestIdxVal) { - // check if current is second best - secondBestIdx = k; - secondBestIdxVal = qualSum; - } - } - if (haplotypeList.size() > 0) { - Haplotype haplotypeR = haplotypeList.get(bestIdx); - Haplotype haplotypeA = haplotypeList.get(secondBestIdx); - //System.out.format("%d %d\n",bestIdx, secondBestIdx); - // Temp hack to match old implementation's scaling, TBD better behavior - - return Arrays.asList(new Haplotype(haplotypeR.getBasesAsBytes(), 60), new Haplotype(haplotypeA.getBasesAsBytes(), contextSize)); - } - else + // Now retrieve the two most popular haplotypes + if (consensusHaplotypeQueue.size() > 0) { + // Since the consensus haplotypes are in a quality-ordered priority queue, the two best haplotypes are just the first two in the queue + final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); + Haplotype haplotype2 = consensusHaplotypeQueue.poll(); + if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found + return Arrays.asList(new Haplotype(haplotype1.getBasesAsBytes(), 60), new Haplotype(haplotype2.getBasesAsBytes(), 20)); // These qual values aren't used for anything + } else { return null; + } } - private Haplotype getHaplotypeFromRead(PileupElement p, int contextSize) { - SAMRecord read = p.getRead(); + private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { + final SAMRecord read = p.getRead(); int readOffsetFromPileup = p.getOffset(); - int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; - byte[] haplotypeBases = new byte[contextSize]; + final byte[] haplotypeBases = new byte[contextSize]; for(int i=0; i < contextSize; i++) { haplotypeBases[i] = (byte)REGEXP_WILDCARD; } - double[] baseQualities = new double[contextSize]; - Arrays.fill(baseQualities,0.0); + final double[] baseQualities = new double[contextSize]; + Arrays.fill(baseQualities, 0.0); + + byte[] readBases = read.getReadBases(); + readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string + byte[] readQuals = read.getBaseQualities(); + readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string + + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); for (int i = 0; i < contextSize; i++ ) { - int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) + final int baseOffset = i + baseOffsetStart; + if ( baseOffset < 0 ) { continue; - if ( baseOffset >= readBases.length ) + } + if ( baseOffset >= readBases.length ) { break; - + } + if( readQuals[baseOffset] == PileupElement.DELETION_BASE) { readQuals[baseOffset] = PileupElement.DELETION_QUAL; } + if( readBases[baseOffset] == BaseUtils.N ) { readBases[baseOffset] = (byte)REGEXP_WILDCARD; readQuals[baseOffset] = (byte) 4; } // N's shouldn't be treated as distinct bases + if( ((double)readQuals[baseOffset]) < 4.0 ) { readQuals[baseOffset] = (byte) 4; } // quals less than 4 are used as codes and don't have actual probabilistic meaning behind them haplotypeBases[i] = readBases[baseOffset]; baseQualities[i] = (double)readQuals[baseOffset]; } @@ -200,19 +190,20 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { return new Haplotype(haplotypeBases, baseQualities); } - private Haplotype getConsensusHaplotype(Haplotype haplotypeA, Haplotype haplotypeB) { + private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplotype haplotypeB) { final byte[] a = haplotypeA.getBasesAsBytes(); final byte[] b = haplotypeB.getBasesAsBytes(); - if (a.length != b.length) + if (a.length != b.length) { throw new ReviewedStingException("Haplotypes a and b must be of same length"); + } byte chA, chB; - byte wc = (byte)REGEXP_WILDCARD; + final byte wc = (byte)REGEXP_WILDCARD; final int length = a.length; - byte[] consensusChars = new byte[length]; - double[] consensusQuals = new double[length]; + final byte[] consensusChars = new byte[length]; + final double[] consensusQuals = new double[length]; final double[] qualsA = haplotypeA.getQuals(); final double[] qualsB = haplotypeB.getQuals(); @@ -243,45 +234,34 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { return new Haplotype(consensusChars, consensusQuals); } + // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes - private double scoreReadsAgainstHaplotypes(List haplotypes, ReadBackedPileup pileup, int contextSize) { + private double scoreReadsAgainstHaplotypes(final List haplotypes, final ReadBackedPileup pileup, final int contextSize, final int locus) { // if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0)); -// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(1)); +// if ( DEBUG ) System.out.printf("HAP2: %s%n", haplotypes.get(1)); - double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()]; - int pileupOffset = 0; - for ( PileupElement p : pileup ) { - if ( useRead(p) ) { - SAMRecord read = p.getRead(); - int readOffsetFromPileup = p.getOffset(); - - if (ReadUtils.is454Read(read)) - continue; - - if ( DEBUG ) System.out.printf("--------------------------------------------- Read %s%n", read.getReadName()); - double m = 10000000; - for ( int i = 0; i < haplotypes.size(); i++ ) { - Haplotype haplotype = haplotypes.get(i); - int start = readOffsetFromPileup - (contextSize - 1)/2; - double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype); - haplotypeScores[pileupOffset][i] = score; - if ( DEBUG ) System.out.printf(" vs. haplotype %d = %f%n", i, score); - m = Math.min(score, m); - } - if ( DEBUG ) System.out.printf(" => best score was %f%n", m); + final ArrayList haplotypeScores = new ArrayList(); + for ( final PileupElement p : pileup ) { + // Score all the reads in the pileup, even the filtered ones + final double[] scores = new double[haplotypes.size()]; + for ( int i = 0; i < haplotypes.size(); i++ ) { + final Haplotype haplotype = haplotypes.get(i); + final double score = scoreReadAgainstHaplotype(p, contextSize, haplotype, locus); + scores[i] = score; + if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i, score); } } - pileupOffset++; // todo -- remove me + haplotypeScores.add(scores); } double overallScore = 0.0; - for ( double[] readHaplotypeScores : haplotypeScores ) { + for ( final double[] readHaplotypeScores : haplotypeScores ) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } return overallScore; } - private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Haplotype haplotype ) { + private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus ) { double expected = 0.0; double mismatches = 0.0; @@ -296,41 +276,49 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch. // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n final byte[] haplotypeBases = haplotype.getBasesAsBytes(); - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); + final SAMRecord read = p.getRead(); + byte[] readBases = read.getReadBases(); + + readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string + byte[] readQuals = read.getBaseQualities(); + readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string + int readOffsetFromPileup = p.getOffset(); + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; + for ( int i = 0; i < contextSize; i++ ) { - int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) + final int baseOffset = i + baseOffsetStart; + if ( baseOffset < 0 ) { continue; - if ( baseOffset >= readBases.length ) + } + if ( baseOffset >= readBases.length ) { break; + } - byte haplotypeBase = haplotypeBases[i]; - byte readBase = readBases[baseOffset]; + final byte haplotypeBase = haplotypeBases[i]; + final byte readBase = readBases[baseOffset]; - boolean matched = readBase == haplotypeBase; - double e = QualityUtils.qualToErrorProb(readQuals[baseOffset]); + + final boolean matched = (readBase == haplotypeBase); // Purposefully counting wildcards in the chosen haplotype as a mismatch + byte qual = readQuals[baseOffset]; + if( qual == PileupElement.DELETION_BASE ) { qual = PileupElement.DELETION_QUAL; } + if( ((double) qual) < 4.0 ) { qual = (byte) 4; } // quals less than 4 are used as codes and don't have actual probabilistic meaning behind them + final double e = QualityUtils.qualToErrorProb(qual); expected += e; - mismatches += matched ? e : 1 - e / 3; + mismatches += matched ? e : 1.0 - e / 3.0; // a more sophisticated calculation would include the reference quality, but it's nice to actually penalize // the mismatching of poorly determined regions of the consensus - if ( DEBUG ) System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n", - read.getReadName(), (char)haplotypeBase, (char)readBase, e, readQuals[baseOffset], expected, mismatches); +// if ( DEBUG ) { +// System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n", +// read.getReadName(), (char)haplotypeBase, (char)readBase, e, readQuals[baseOffset], expected, mismatches); +// } } return mismatches - expected; } - - private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length]; - static { - for ( int i = 0; i < BaseUtils.Base.values().length; i++ ) - FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length); - } - - public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with two (and only two) segregating haplotypes")); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 1c9c9312e..7036d15c6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.Arrays; -public class MappingQualityRankSumTest /*extends RankSumTest*/ { +public class MappingQualityRankSumTest extends RankSumTest { public List getKeyNames() { return Arrays.asList("MQRankSum"); } @@ -21,10 +21,11 @@ public class MappingQualityRankSumTest /*extends RankSumTest*/ { if ( p.isDeletion() ) continue; - if ( p.getBase() == ref ) + if ( p.getBase() == ref ) { refQuals.add(p.getMappingQual()); - else if ( (char)p.getBase() == alt ) + } else if ( (char)p.getBase() == alt ) { altQuals.add(p.getMappingQual()); + } } } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 68696fe04..a80ebc3f8 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -15,9 +15,9 @@ import java.util.Map; import java.util.HashMap; -public abstract class RankSumTest implements InfoFieldAnnotation, WorkInProgressAnnotation { +public abstract class RankSumTest implements InfoFieldAnnotation, ExperimentalAnnotation { private final static boolean DEBUG = false; - private static final double minPValue = 1e-10; + private static final double minPValue = 1e-20; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -30,45 +30,49 @@ public abstract class RankSumTest implements InfoFieldAnnotation, WorkInProgress if ( genotypes == null || genotypes.size() == 0 ) return null; - ArrayList refQuals = new ArrayList(); - ArrayList altQuals = new ArrayList(); + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); - for ( Map.Entry genotype : genotypes.entrySet() ) { - // we care only about het calls - if ( genotype.getValue().isHet() ) { - StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey()); + for ( final Map.Entry genotype : genotypes.entrySet() ) { + if ( !genotype.getValue().isHomRef() ) { + final StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey()); if ( context == null ) continue; fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).toString().charAt(0), context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), refQuals, altQuals); } } - - WilcoxonRankSum wilcoxon = new WilcoxonRankSum(); - for ( Integer qual : altQuals ) + final WilcoxonRankSum wilcoxon = new WilcoxonRankSum(); + for ( final Integer qual : altQuals ) { wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET1); - for ( Integer qual : refQuals ) + } + for ( final Integer qual : refQuals ) { wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2); - - // for R debugging - if ( DEBUG ) { - wilcoxon.DEBUG = DEBUG; - System.out.printf("%s%n", ref.getLocus()); - System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals)); - System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals)); } + // for R debugging + //if ( DEBUG ) { + // wilcoxon.DEBUG = DEBUG; + // System.out.printf("%s%n", ref.getLocus()); + // System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals)); + // System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals)); + //} + // we are testing these set1 (the alt bases) have lower quality scores than set2 (the ref bases) - double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SMALLER_SET_LT); - if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 ) - return null; + double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SET1_LT_SET2); + //System.out.println("p = " + pvalue); + //System.out.println(); + if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 ) { + pvalue = 1.0; + } // deal with precision issues - if ( pvalue < minPValue ) + if ( pvalue < minPValue ) { pvalue = minPValue; + } - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue))); + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); return map; } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 8cfbf6dd4..79d457c82 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -325,6 +325,22 @@ public class MathUtils { return Math.sqrt(rms); } + /** + * calculate the Root Mean Square of an array of doubles + * @param x a double[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(Double[] x) { + if ( x.length == 0 ) + return 0.0; + + double rms = 0.0; + for (Double i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + /** * normalizes the log10-based array * diff --git a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java index b9ea9d0bc..bb925104c 100755 --- a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java +++ b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java @@ -92,8 +92,8 @@ public class WilcoxonRankSum { // just used to convert a z-score into a cumulative probability public boolean DEBUG = false; - private static final double NORMAL_MEAN = 0; - private static final double NORMAL_SD = 1; + private static final double NORMAL_MEAN = 0.0; + private static final double NORMAL_SD = 3.0; // Need to spread out the standard normal in order to provide better resolution private static final Normal NORMAL = new Normal(NORMAL_MEAN, NORMAL_SD, null); // The minimum length for both data series (individually) in order to use a normal distribution @@ -151,11 +151,11 @@ public class WilcoxonRankSum { double pvalue; // if we don't have enough data points, quit - if ( n1 < minimumNormalN || n2 < minimumNormalN ) { - pvalue = exactCalculation(h0, n1, n2, U1, U2); - } else { +// if ( n1 < minimumNormalN || n2 < minimumNormalN ) { +// pvalue = exactCalculation(h0, n1, n2, U1, U2); +// } else { pvalue = normalApproximation(h0, n1, n2, U1, U2); - } +// } if ( DEBUG && (n1 < minimumNormalN || n2 < minimumNormalN) ) { //for (int i = 0; i < observations.size(); i++) diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index 675cef5ff..6d8bf11de 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -81,6 +81,14 @@ public class Haplotype { return s; } + public String toString() { + String returnString = ""; + for(int iii = 0; iii < bases.length; iii++) { + returnString += (char) bases[iii]; + } + return returnString; + } + public double[] getQuals() { return quals; } diff --git a/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 5d563c84c..f8b1d9dfa 100755 --- a/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -18,7 +18,11 @@ import java.util.Arrays; */ public class PileupElement { public static final byte DELETION_BASE = BaseUtils.D; - public static final byte DELETION_QUAL = 0; + public static final byte INSERTION_BASE_A = 87; + public static final byte INSERTION_BASE_C = 88; + public static final byte INSERTION_BASE_T = 89; + public static final byte INSERTION_BASE_G = 90; + public static final byte DELETION_QUAL = 18; protected SAMRecord read; protected int offset; diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 92c59ca2f..63ee5882e 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -422,7 +422,7 @@ public class AlignmentUtils { switch( ce.getOperator() ) { case I: case S: - for ( int jjj = 0 ; jjj < elementLength; jjj++ ) { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { alignment[alignPos++] = '+'; } break; @@ -431,7 +431,7 @@ public class AlignmentUtils { refPos++; break; case M: - for ( int jjj = 0 ; jjj < elementLength; jjj++ ) { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { alignment[alignPos] = ref[refPos]; alignPos++; refPos++; @@ -447,6 +447,147 @@ public class AlignmentUtils { return alignment; } + public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) { + + boolean atDeletion = false; + if(pileupOffset == -1) { + atDeletion = true; + pileupOffset = refLocus - alignmentStart; + final CigarElement ce = cigar.getCigarElement(0); + if( ce.getOperator() == CigarOperator.S ) { + pileupOffset += ce.getLength(); + } + } + int pos = 0; + int alignmentPos = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + case S: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + if( pos == pileupOffset ) { + return alignmentPos; + } + pos++; + } + break; + case D: + case N: + if(!atDeletion) { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignmentPos++; + } + } else { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + if( pos == pileupOffset ) { + return alignmentPos; + } + pos++; + alignmentPos++; + } + + } + break; + case M: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + if( pos == pileupOffset ) { + return alignmentPos; + } + pos++; + alignmentPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignmentPos; + } + + public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) { + + int alignmentLength = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + case S: + break; + case D: + case N: + alignmentLength += elementLength; + break; + case M: + alignmentLength += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + + final byte[] alignment = new byte[alignmentLength]; + int alignPos = 0; + int readPos = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + if( alignPos > 0 ) { + if(alignment[alignPos-1] == (byte) 'A') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_A; + } else if(alignment[alignPos-1] == (byte) 'C') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_C; + } else if(alignment[alignPos-1] == (byte) 'T') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_T; + } else if(alignment[alignPos-1] == (byte) 'G') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_G; + } + } + case S: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + readPos++; + } + break; + case D: + case N: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos] = PileupElement.DELETION_BASE; + alignPos++; + } + break; + case M: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos] = read[readPos]; + alignPos++; + readPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignment; + } + /** * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 72cb124ea..6545128e0 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -31,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("7ffa7d0ae9e4265eced1f793be9eba11")); + Arrays.asList("98861116ac1cdaf152adead3183664d8")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -39,7 +39,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("063235b9a98137902a77b6efbaffc618")); + Arrays.asList("53d083505fc82fb388566d3856ed20ba")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -63,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a20994036d2deeeff377a89d9b93a0c6")); + Arrays.asList("c5807c5794c3a847d78e3800553d989a")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -71,7 +71,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("3677fd14ef2975afdca559a871f57a33")); + Arrays.asList("a9b2a7adee7ba8b0f5d7ff8d92e6dfbd")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -79,7 +79,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("d4d33310e0280a9329efb8c6b39f7661")); + Arrays.asList("e3c839910aa82061742e33196b112cb0")); executeTest("test overwriting header", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9380bb06f..dba3eb8fe 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -25,7 +25,7 @@ public class public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("e90af2265bdbfc1c336c7e1484b86a4a")); + Arrays.asList("ae901e034b00aef16d36295821b3ea63")); executeTest("testMultiSamplePilot1", spec); } @@ -33,7 +33,7 @@ public class public void testMultiSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList("9ef1405f3ddf4a9894d12718cc6041a1")); + Arrays.asList("2ad026dee3fe592c124eb8724a843a5e")); executeTest("testMultiSamplePilot2", spec); } @@ -41,7 +41,7 @@ public class public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("88a095d59e3210955dd066e54cfff6cd")); + Arrays.asList("bc573090407ae9ec4401757eaa03de20")); executeTest("testSingleSamplePilot2", spec); } @@ -51,7 +51,7 @@ public class // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "f213174bc3f6890dbe72628763be75d0"; + private final static String COMPRESSED_OUTPUT_MD5 = "1889ecc5aa1b23b2e77b1bd192577f1a"; @Test public void testCompressedOutput() { @@ -78,7 +78,7 @@ public class @Test public void testParallelization() { - String md5 = "4c88572ec014cd0b256b76cb5fac41df"; + String md5 = "d19745ab31f903de8d5a8e853b4e52dd"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -105,12 +105,12 @@ public class @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "9d24c57250ec66905a157975c27f7094" ); - e.put( "-all_bases", "6bd860e4de6a4f013693a49556ccfd02" ); - e.put( "--min_base_quality_score 26", "94de36ab7021e767f14903b7fd0cf80e" ); - e.put( "--min_mapping_quality_score 26", "a86e9cdc629f0957658f8d570014f45b" ); - e.put( "--max_mismatches_in_40bp_window 5", "4cf60eeff7f25d8e778c72deb7e14cc2" ); - e.put( "--p_nonref_model GRID_SEARCH", "eda1afbdb42c9c5d6fc07a321020071a" ); + e.put( "-genotype", "37d3954e19309a24c386758afad93252" ); + e.put( "-all_bases", "04568093c5dc70fa7965b4ab15fd0f7e" ); + e.put( "--min_base_quality_score 26", "5d1886a9637183707645bc2dc6bf8282" ); + e.put( "--min_mapping_quality_score 26", "78423524cf56cce1d0847847d542459f" ); + e.put( "--max_mismatches_in_40bp_window 5", "2963c771aafe84b62082f475d20bad5e" ); + e.put( "--p_nonref_model GRID_SEARCH", "c254a4e593b4ffb112299be874503ce0" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -124,12 +124,12 @@ public class public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("eda1afbdb42c9c5d6fc07a321020071a")); + Arrays.asList("c254a4e593b4ffb112299be874503ce0")); executeTest("testConfidence1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("8daa14278976555e64c582c4e44b9b8e")); + Arrays.asList("d2323a0234d27257393f4931fca70dbc")); executeTest("testConfidence2", spec2); } @@ -141,8 +141,8 @@ public class @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "3679786112b414546a464c94c900174e" ); - e.put( 1.0 / 1850, "efa1cb09fa72dd4bd6dbdf6c0fa0f038" ); + e.put( 0.01, "96e85b26cf5f0a523b4b0886dbb902b1" ); + e.put( 1.0 / 1850, "5ffecb68b52169ded04312aa5dcdc137" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -165,7 +165,7 @@ public class " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("037e7c0d56e88b4d85f326bf27ad9f1c")); + Arrays.asList("e65c95a9d2a0995078d3e6835cf4ee61")); executeTest(String.format("testMultiTechnologies"), spec); }