HaplotypeScore is revamped. It now uses reads' Cigar strings when building the haplotype blocks to skip over soft-clipped bases and factor in insertions and deletions. The statistic now uses only the reads from the filtered context to build the haplotypes, but it scores all reads against the two best haplotypes. The score is now computed individually for each sample's reads and then averaged together. Bug fixes throughout. The math for the base quality and mapping quality rank sum tests is fixed. The annotations remain as ExperimentalAnnotations pending more investigation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4934 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2011-01-05 00:28:05 +00:00
parent 85714621be
commit 23dbc5ccf3
11 changed files with 362 additions and 200 deletions

View File

@ -9,8 +9,7 @@ import java.util.List;
import java.util.Arrays;
public class BaseQualityRankSumTest /*extends RankSumTest*/ {
// todo -- seems math in this test is dubious, need to recheck and verify (p-values wildly divergent from R or MATLAB)
public class BaseQualityRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Phred-scaled p-value From Wilcoxon Rank Sum Test of Het Vs. Ref Base Qualities")); }
@ -21,10 +20,11 @@ public class BaseQualityRankSumTest /*extends RankSumTest*/ {
if ( p.isDeletion() )
continue;
if ( p.getBase() == ref )
if ( p.getBase() == ref ) {
refQuals.add((int)p.getQual());
else if ( (char)p.getBase() == alt )
} else if ( (char)p.getBase() == alt ) {
altQuals.add((int)p.getQual());
}
}
}
}

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
@ -39,49 +40,51 @@ import org.broadinstitute.sting.utils.pileup.*;
import java.util.*;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
private final static boolean DEBUG = false;
private final static int MIN_CONTEXT_WING_SIZE = 10;
private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 20;
private final static int MIN_CONTEXT_WING_SIZE = 20;
private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50;
private final static char REGEXP_WILDCARD = '.';
public boolean useRead(PileupElement p) {
return ! ReadUtils.is454Read(p.getRead());
// TODO -- warning, min. mapping quality fixed to constant 1 here for indels. How can I determine the min. mapping quality?
// if ( ReadUtils.is454Read(p.getRead()) ) // we're not a 454 read
// return false;
// else if ( p.getOffset() == -1 ) // indel
// return p.getRead().getMappingQuality() > 0;
// else if ( ! (p.getRead() instanceof GATKSAMRecord) ) // we're not a GATKSamRecord, so we can't filter
// return true;
// else
// return ((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset()); // we used the base to call the variant
//return true; // Use all reads
//return !ReadUtils.is454Read(p.getRead()); // Use all non-454 reads
return p.getOffset() == -1 || ((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset()); // Use all reads from the filtered context
}
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
if ( !vc.isBiallelic() || !vc.isSNP() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here
return null;
final AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values());
AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values());
final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE);
final int contextSize = contextWingSize * 2 + 1;
int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE);
int contextSize = contextWingSize * 2 + 1;
final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2;
// Compute all haplotypes consistent with the current read pileup
List<Haplotype> haplotypes = computeHaplotypes(context.getBasePileup(), contextSize);
final List<Haplotype> haplotypes = computeHaplotypes(context.getBasePileup(), contextSize, locus);
// calculate the haplotype scores by walking over all reads and comparing them to the haplotypes
double score = 0.0;
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
if (haplotypes != null) {
final Set<Map.Entry<String, Genotype>> genotypes = vc.getGenotypes().entrySet();
for ( final Map.Entry<String, Genotype> genotype : genotypes ) {
final StratifiedAlignmentContext thisContext = stratifiedContexts.get(genotype.getKey());
if ( thisContext != null ) {
double thisScore = scoreReadsAgainstHaplotypes(haplotypes, thisContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), contextSize, locus);
scoreRA.add(thisScore); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
}
}
}
if (haplotypes != null)
score = scoreReadsAgainstHaplotypes(haplotypes, context.getBasePileup(), contextSize);
// return the score
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", score));
// annotate the score in the info field
final Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean()));
return map;
}
@ -97,102 +100,89 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
}
}
private List<Haplotype> computeHaplotypes(ReadBackedPileup pileup, int contextSize) {
private List<Haplotype> computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus) {
// Compute all possible haplotypes consistent with current pileup
ArrayList<Haplotype> haplotypeList = new ArrayList<Haplotype>();
PriorityQueue<Haplotype> haplotypeQueue = new PriorityQueue<Haplotype>(100, new HaplotypeComparator());
final PriorityQueue<Haplotype> candidateHaplotypeQueue = new PriorityQueue<Haplotype>(100, new HaplotypeComparator());
final PriorityQueue<Haplotype> consensusHaplotypeQueue = new PriorityQueue<Haplotype>(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator());
for ( PileupElement p : pileup ) {
for ( final PileupElement p : pileup ) {
if ( useRead(p) ) {
Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize);
haplotypeQueue.add(haplotypeFromRead);
final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus);
candidateHaplotypeQueue.add(haplotypeFromRead);
}
}
// Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes
Haplotype elem;
while ((elem = haplotypeQueue.poll()) != null) {
//System.out.print("element: "+elem.toString());
//System.out.format(" SumQual = %f\n", elem.getQualitySum());
while ((elem = candidateHaplotypeQueue.poll()) != null) {
boolean foundHaplotypeMatch = false;
//Haplotype[] remainingHaplotypes = haplotypeQueue.toArray(new Haplotype[haplotypeQueue.size()]);
for ( Haplotype haplotypeFromList : haplotypeList ) {
Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList);
//System.out.format("-Checking consensus for %s:", haplotypeFromList.toString());
Haplotype lastCheckedHaplotype = null;
for ( final Haplotype haplotypeFromList : consensusHaplotypeQueue ) {
final Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList);
if (consensusHaplotype != null) {
//System.out.format("--Consensus haplotype = %s, qual = %f\n", consensusHaplotype.toString(), consensusHaplotype.getQualitySum());
foundHaplotypeMatch = true;
if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) {
haplotypeList.remove(haplotypeFromList);
haplotypeList.add(consensusHaplotype);
consensusHaplotypeQueue.remove(haplotypeFromList);
consensusHaplotypeQueue.add(consensusHaplotype);
}
break;
}
/* else {
System.out.println("no consensus found");
else {
lastCheckedHaplotype = haplotypeFromList;
}
*/
}
if (!foundHaplotypeMatch && haplotypeList.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) {
haplotypeList.add(elem);
if (!foundHaplotypeMatch && consensusHaplotypeQueue.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) {
consensusHaplotypeQueue.add(elem);
} else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum() ) {
consensusHaplotypeQueue.remove(lastCheckedHaplotype);
consensusHaplotypeQueue.add(elem);
}
}
// Now retrieve two most popular haplotypes
// TODO - quick and dirty solution, could use better data structures to do this automatically
int bestIdx=0, secondBestIdx=0;
double bestIdxVal=-1.0, secondBestIdxVal = -1.0;
for (int k=0; k < haplotypeList.size(); k++) {
double qualSum = haplotypeList.get(k).getQualitySum();
if (qualSum >= bestIdxVal) {
secondBestIdx = bestIdx;
secondBestIdxVal = bestIdxVal;
bestIdx = k;
bestIdxVal = qualSum;
}
else if (qualSum >= secondBestIdxVal) {
// check if current is second best
secondBestIdx = k;
secondBestIdxVal = qualSum;
}
}
if (haplotypeList.size() > 0) {
Haplotype haplotypeR = haplotypeList.get(bestIdx);
Haplotype haplotypeA = haplotypeList.get(secondBestIdx);
//System.out.format("%d %d\n",bestIdx, secondBestIdx);
// Temp hack to match old implementation's scaling, TBD better behavior
return Arrays.asList(new Haplotype(haplotypeR.getBasesAsBytes(), 60), new Haplotype(haplotypeA.getBasesAsBytes(), contextSize));
}
else
// Now retrieve the two most popular haplotypes
if (consensusHaplotypeQueue.size() > 0) {
// Since the consensus haplotypes are in a quality-ordered priority queue, the two best haplotypes are just the first two in the queue
final Haplotype haplotype1 = consensusHaplotypeQueue.poll();
Haplotype haplotype2 = consensusHaplotypeQueue.poll();
if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found
return Arrays.asList(new Haplotype(haplotype1.getBasesAsBytes(), 60), new Haplotype(haplotype2.getBasesAsBytes(), 20)); // These qual values aren't used for anything
} else {
return null;
}
}
private Haplotype getHaplotypeFromRead(PileupElement p, int contextSize) {
SAMRecord read = p.getRead();
private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) {
final SAMRecord read = p.getRead();
int readOffsetFromPileup = p.getOffset();
int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2;
byte[] haplotypeBases = new byte[contextSize];
final byte[] haplotypeBases = new byte[contextSize];
for(int i=0; i < contextSize; i++) {
haplotypeBases[i] = (byte)REGEXP_WILDCARD;
}
double[] baseQualities = new double[contextSize];
Arrays.fill(baseQualities,0.0);
final double[] baseQualities = new double[contextSize];
Arrays.fill(baseQualities, 0.0);
byte[] readBases = read.getReadBases();
readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
byte[] readQuals = read.getBaseQualities();
readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string
readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus);
final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2;
final byte[] readBases = read.getReadBases();
final byte[] readQuals = read.getBaseQualities();
for (int i = 0; i < contextSize; i++ ) {
int baseOffset = i + baseOffsetStart;
if ( baseOffset < 0 )
final int baseOffset = i + baseOffsetStart;
if ( baseOffset < 0 ) {
continue;
if ( baseOffset >= readBases.length )
}
if ( baseOffset >= readBases.length ) {
break;
}
if( readQuals[baseOffset] == PileupElement.DELETION_BASE) { readQuals[baseOffset] = PileupElement.DELETION_QUAL; }
if( readBases[baseOffset] == BaseUtils.N ) { readBases[baseOffset] = (byte)REGEXP_WILDCARD; readQuals[baseOffset] = (byte) 4; } // N's shouldn't be treated as distinct bases
if( ((double)readQuals[baseOffset]) < 4.0 ) { readQuals[baseOffset] = (byte) 4; } // quals less than 4 are used as codes and don't have actual probabilistic meaning behind them
haplotypeBases[i] = readBases[baseOffset];
baseQualities[i] = (double)readQuals[baseOffset];
}
@ -200,19 +190,20 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
return new Haplotype(haplotypeBases, baseQualities);
}
private Haplotype getConsensusHaplotype(Haplotype haplotypeA, Haplotype haplotypeB) {
private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplotype haplotypeB) {
final byte[] a = haplotypeA.getBasesAsBytes();
final byte[] b = haplotypeB.getBasesAsBytes();
if (a.length != b.length)
if (a.length != b.length) {
throw new ReviewedStingException("Haplotypes a and b must be of same length");
}
byte chA, chB;
byte wc = (byte)REGEXP_WILDCARD;
final byte wc = (byte)REGEXP_WILDCARD;
final int length = a.length;
byte[] consensusChars = new byte[length];
double[] consensusQuals = new double[length];
final byte[] consensusChars = new byte[length];
final double[] consensusQuals = new double[length];
final double[] qualsA = haplotypeA.getQuals();
final double[] qualsB = haplotypeB.getQuals();
@ -243,45 +234,34 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
return new Haplotype(consensusChars, consensusQuals);
}
// calculate the haplotype scores by walking over all reads and comparing them to the haplotypes
private double scoreReadsAgainstHaplotypes(List<Haplotype> haplotypes, ReadBackedPileup pileup, int contextSize) {
private double scoreReadsAgainstHaplotypes(final List<Haplotype> haplotypes, final ReadBackedPileup pileup, final int contextSize, final int locus) {
// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0));
// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(1));
// if ( DEBUG ) System.out.printf("HAP2: %s%n", haplotypes.get(1));
double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()];
int pileupOffset = 0;
for ( PileupElement p : pileup ) {
if ( useRead(p) ) {
SAMRecord read = p.getRead();
int readOffsetFromPileup = p.getOffset();
if (ReadUtils.is454Read(read))
continue;
if ( DEBUG ) System.out.printf("--------------------------------------------- Read %s%n", read.getReadName());
double m = 10000000;
for ( int i = 0; i < haplotypes.size(); i++ ) {
Haplotype haplotype = haplotypes.get(i);
int start = readOffsetFromPileup - (contextSize - 1)/2;
double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype);
haplotypeScores[pileupOffset][i] = score;
if ( DEBUG ) System.out.printf(" vs. haplotype %d = %f%n", i, score);
m = Math.min(score, m);
}
if ( DEBUG ) System.out.printf(" => best score was %f%n", m);
final ArrayList<double[]> haplotypeScores = new ArrayList<double[]>();
for ( final PileupElement p : pileup ) {
// Score all the reads in the pileup, even the filtered ones
final double[] scores = new double[haplotypes.size()];
for ( int i = 0; i < haplotypes.size(); i++ ) {
final Haplotype haplotype = haplotypes.get(i);
final double score = scoreReadAgainstHaplotype(p, contextSize, haplotype, locus);
scores[i] = score;
if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i, score); }
}
pileupOffset++; // todo -- remove me
haplotypeScores.add(scores);
}
double overallScore = 0.0;
for ( double[] readHaplotypeScores : haplotypeScores ) {
for ( final double[] readHaplotypeScores : haplotypeScores ) {
overallScore += MathUtils.arrayMin(readHaplotypeScores);
}
return overallScore;
}
private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Haplotype haplotype ) {
private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus ) {
double expected = 0.0;
double mismatches = 0.0;
@ -296,41 +276,49 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
// the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch.
// so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
final byte[] haplotypeBases = haplotype.getBasesAsBytes();
final byte[] readBases = read.getReadBases();
final byte[] readQuals = read.getBaseQualities();
final SAMRecord read = p.getRead();
byte[] readBases = read.getReadBases();
readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
byte[] readQuals = read.getBaseQualities();
readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string
int readOffsetFromPileup = p.getOffset();
readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus);
final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2;
for ( int i = 0; i < contextSize; i++ ) {
int baseOffset = i + baseOffsetStart;
if ( baseOffset < 0 )
final int baseOffset = i + baseOffsetStart;
if ( baseOffset < 0 ) {
continue;
if ( baseOffset >= readBases.length )
}
if ( baseOffset >= readBases.length ) {
break;
}
byte haplotypeBase = haplotypeBases[i];
byte readBase = readBases[baseOffset];
final byte haplotypeBase = haplotypeBases[i];
final byte readBase = readBases[baseOffset];
boolean matched = readBase == haplotypeBase;
double e = QualityUtils.qualToErrorProb(readQuals[baseOffset]);
final boolean matched = (readBase == haplotypeBase); // Purposefully counting wildcards in the chosen haplotype as a mismatch
byte qual = readQuals[baseOffset];
if( qual == PileupElement.DELETION_BASE ) { qual = PileupElement.DELETION_QUAL; }
if( ((double) qual) < 4.0 ) { qual = (byte) 4; } // quals less than 4 are used as codes and don't have actual probabilistic meaning behind them
final double e = QualityUtils.qualToErrorProb(qual);
expected += e;
mismatches += matched ? e : 1 - e / 3;
mismatches += matched ? e : 1.0 - e / 3.0;
// a more sophisticated calculation would include the reference quality, but it's nice to actually penalize
// the mismatching of poorly determined regions of the consensus
if ( DEBUG ) System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n",
read.getReadName(), (char)haplotypeBase, (char)readBase, e, readQuals[baseOffset], expected, mismatches);
// if ( DEBUG ) {
// System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n",
// read.getReadName(), (char)haplotypeBase, (char)readBase, e, readQuals[baseOffset], expected, mismatches);
// }
}
return mismatches - expected;
}
private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length];
static {
for ( int i = 0; i < BaseUtils.Base.values().length; i++ )
FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length);
}
public List<String> getKeyNames() { return Arrays.asList("HaplotypeScore"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with two (and only two) segregating haplotypes")); }
}

View File

@ -9,7 +9,7 @@ import java.util.List;
import java.util.Arrays;
public class MappingQualityRankSumTest /*extends RankSumTest*/ {
public class MappingQualityRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("MQRankSum"); }
@ -21,10 +21,11 @@ public class MappingQualityRankSumTest /*extends RankSumTest*/ {
if ( p.isDeletion() )
continue;
if ( p.getBase() == ref )
if ( p.getBase() == ref ) {
refQuals.add(p.getMappingQual());
else if ( (char)p.getBase() == alt )
} else if ( (char)p.getBase() == alt ) {
altQuals.add(p.getMappingQual());
}
}
}
}

View File

@ -15,9 +15,9 @@ import java.util.Map;
import java.util.HashMap;
public abstract class RankSumTest implements InfoFieldAnnotation, WorkInProgressAnnotation {
public abstract class RankSumTest implements InfoFieldAnnotation, ExperimentalAnnotation {
private final static boolean DEBUG = false;
private static final double minPValue = 1e-10;
private static final double minPValue = 1e-20;
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
@ -30,45 +30,49 @@ public abstract class RankSumTest implements InfoFieldAnnotation, WorkInProgress
if ( genotypes == null || genotypes.size() == 0 )
return null;
ArrayList<Integer> refQuals = new ArrayList<Integer>();
ArrayList<Integer> altQuals = new ArrayList<Integer>();
final ArrayList<Integer> refQuals = new ArrayList<Integer>();
final ArrayList<Integer> altQuals = new ArrayList<Integer>();
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
// we care only about het calls
if ( genotype.getValue().isHet() ) {
StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
if ( !genotype.getValue().isHomRef() ) {
final StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
if ( context == null )
continue;
fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).toString().charAt(0), context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), refQuals, altQuals);
}
}
WilcoxonRankSum wilcoxon = new WilcoxonRankSum();
for ( Integer qual : altQuals )
final WilcoxonRankSum wilcoxon = new WilcoxonRankSum();
for ( final Integer qual : altQuals ) {
wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET1);
for ( Integer qual : refQuals )
}
for ( final Integer qual : refQuals ) {
wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2);
// for R debugging
if ( DEBUG ) {
wilcoxon.DEBUG = DEBUG;
System.out.printf("%s%n", ref.getLocus());
System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals));
System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals));
}
// for R debugging
//if ( DEBUG ) {
// wilcoxon.DEBUG = DEBUG;
// System.out.printf("%s%n", ref.getLocus());
// System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals));
// System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals));
//}
// we are testing these set1 (the alt bases) have lower quality scores than set2 (the ref bases)
double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SMALLER_SET_LT);
if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 )
return null;
double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SET1_LT_SET2);
//System.out.println("p = " + pvalue);
//System.out.println();
if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 ) {
pvalue = 1.0;
}
// deal with precision issues
if ( pvalue < minPValue )
if ( pvalue < minPValue ) {
pvalue = minPValue;
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue)));
final Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
return map;
}

View File

@ -325,6 +325,22 @@ public class MathUtils {
return Math.sqrt(rms);
}
/**
 * Computes the Root Mean Square (RMS) of an array of values.
 *
 * @param x the values to aggregate; may be empty
 * @return sqrt( (x[0]^2 + ... + x[n-1]^2) / n ), or 0.0 when x is empty
 */
public static double rms(Double[] x) {
    if ( x.length == 0 )
        return 0.0;
    // accumulate the sum of squares, then average and take the square root
    double sumOfSquares = 0.0;
    for ( int k = 0; k < x.length; k++ ) {
        sumOfSquares += x[k] * x[k];
    }
    return Math.sqrt(sumOfSquares / x.length);
}
/**
* normalizes the log10-based array
*

View File

@ -92,8 +92,8 @@ public class WilcoxonRankSum {
// just used to convert a z-score into a cumulative probability
public boolean DEBUG = false;
private static final double NORMAL_MEAN = 0;
private static final double NORMAL_SD = 1;
private static final double NORMAL_MEAN = 0.0;
private static final double NORMAL_SD = 3.0; // Need to spread out the standard normal in order to provide better resolution
private static final Normal NORMAL = new Normal(NORMAL_MEAN, NORMAL_SD, null);
// The minimum length for both data series (individually) in order to use a normal distribution
@ -151,11 +151,11 @@ public class WilcoxonRankSum {
double pvalue;
// if we don't have enough data points, quit
if ( n1 < minimumNormalN || n2 < minimumNormalN ) {
pvalue = exactCalculation(h0, n1, n2, U1, U2);
} else {
// if ( n1 < minimumNormalN || n2 < minimumNormalN ) {
// pvalue = exactCalculation(h0, n1, n2, U1, U2);
// } else {
pvalue = normalApproximation(h0, n1, n2, U1, U2);
}
// }
if ( DEBUG && (n1 < minimumNormalN || n2 < minimumNormalN) ) {
//for (int i = 0; i < observations.size(); i++)

View File

@ -81,6 +81,14 @@ public class Haplotype {
return s;
}
/**
 * Renders this haplotype as a String by converting each stored base byte to its character.
 *
 * @return the haplotype's bases as a String, one character per base
 */
@Override
public String toString() {
    // Use a StringBuilder: repeated String concatenation in a loop is O(n^2).
    final StringBuilder sb = new StringBuilder(bases.length);
    for ( int i = 0; i < bases.length; i++ ) {
        sb.append((char) bases[i]);
    }
    return sb.toString();
}
public double[] getQuals() {
return quals;
}

View File

@ -18,7 +18,11 @@ import java.util.Arrays;
*/
public class PileupElement {
public static final byte DELETION_BASE = BaseUtils.D;
public static final byte DELETION_QUAL = 0;
public static final byte INSERTION_BASE_A = 87;
public static final byte INSERTION_BASE_C = 88;
public static final byte INSERTION_BASE_T = 89;
public static final byte INSERTION_BASE_G = 90;
public static final byte DELETION_QUAL = 18;
protected SAMRecord read;
protected int offset;

View File

@ -422,7 +422,7 @@ public class AlignmentUtils {
switch( ce.getOperator() ) {
case I:
case S:
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
alignment[alignPos++] = '+';
}
break;
@ -431,7 +431,7 @@ public class AlignmentUtils {
refPos++;
break;
case M:
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
alignment[alignPos] = ref[refPos];
alignPos++;
refPos++;
@ -447,6 +447,147 @@ public class AlignmentUtils {
return alignment;
}
/**
 * Translates a pileup offset (an index into the read's own base array) into the corresponding
 * offset in "alignment space" — the read expanded onto reference coordinates, where
 * insertions and soft clips consume read positions but no alignment positions, and
 * deletions/skips consume alignment positions but no read positions
 * (the layout produced by readToAlignmentByteArray).
 *
 * @param cigar          the read's cigar
 * @param pileupOffset   offset of the pileup element within the read; -1 signals that the
 *                       pileup element carries no read base at this locus (deletion/indel)
 * @param alignmentStart the read's alignment start position on the reference
 * @param refLocus       the reference locus of interest (used only in the -1 case)
 * @return the offset into the alignment-space byte array corresponding to pileupOffset
 * @throws ReviewedStingException if the cigar contains an unsupported operator
 */
public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) {
    boolean atDeletion = false;
    if(pileupOffset == -1) {
        // No read base here: synthesize an offset from the reference distance instead,
        // counting any leading soft clip as consumed read positions.
        atDeletion = true;
        pileupOffset = refLocus - alignmentStart;
        final CigarElement ce = cigar.getCigarElement(0);
        if( ce.getOperator() == CigarOperator.S ) {
            pileupOffset += ce.getLength();
        }
    }
    // pos walks pileup (read) positions; alignmentPos walks alignment-space positions.
    int pos = 0;
    int alignmentPos = 0;
    for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
        final CigarElement ce = cigar.getCigarElement(iii);
        final int elementLength = ce.getLength();
        switch( ce.getOperator() ) {
            case I:
            case S:
                // Insertions/soft clips consume read positions but no alignment positions.
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    if( pos == pileupOffset ) {
                        return alignmentPos;
                    }
                    pos++;
                }
                break;
            case D:
            case N:
                if(!atDeletion) {
                    // Deletions/skips consume alignment positions only ...
                    for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                        alignmentPos++;
                    }
                } else {
                    // ... unless we started inside a deletion, in which case pileupOffset was
                    // derived from reference coordinates and deletions consume both counters.
                    for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                        if( pos == pileupOffset ) {
                            return alignmentPos;
                        }
                        pos++;
                        alignmentPos++;
                    }
                }
                break;
            case M:
                // Matches/mismatches consume both read and alignment positions.
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    if( pos == pileupOffset ) {
                        return alignmentPos;
                    }
                    pos++;
                    alignmentPos++;
                }
                break;
            case H:
            case P:
                // Hard clips and padding consume neither read nor alignment positions.
                break;
            default:
                throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
        }
    }
    // Target offset not reached within the cigar; return the final alignment position.
    return alignmentPos;
}
/**
 * Expands a read's byte array into "alignment space" (reference coordinates) according to
 * its cigar: matched bytes are copied through, each deleted/skipped reference position is
 * filled with PileupElement.DELETION_BASE, and inserted/soft-clipped bytes are dropped —
 * except that an insertion recodes the preceding aligned base as one of the
 * PileupElement.INSERTION_BASE_* marker values.
 *
 * NOTE(review): callers pass both base arrays and quality arrays through this routine; the
 * INSERTION_BASE_* recoding compares against 'A'/'C'/'T'/'G' codes, which presumably only
 * matches meaningfully for base arrays — confirm this is harmless for quality arrays.
 *
 * @param cigar the read's cigar
 * @param read  the read's base (or base-quality) byte array
 * @return a new array whose length equals the number of aligned reference positions (M + D + N)
 * @throws ReviewedStingException if the cigar contains an unsupported operator
 */
public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) {
    // First pass: compute the alignment-space length. Only M, D, and N consume
    // reference positions; I, S, H, and P contribute nothing.
    int alignmentLength = 0;
    for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
        final CigarElement ce = cigar.getCigarElement(iii);
        final int elementLength = ce.getLength();
        switch( ce.getOperator() ) {
            case I:
            case S:
                break;
            case D:
            case N:
                alignmentLength += elementLength;
                break;
            case M:
                alignmentLength += elementLength;
                break;
            case H:
            case P:
                break;
            default:
                throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
        }
    }
    // Second pass: fill the alignment array, walking read positions (readPos) and
    // alignment positions (alignPos) independently.
    final byte[] alignment = new byte[alignmentLength];
    int alignPos = 0;
    int readPos = 0;
    for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
        final CigarElement ce = cigar.getCigarElement(iii);
        final int elementLength = ce.getLength();
        switch( ce.getOperator() ) {
            case I:
                // Record the presence of an insertion by recoding the previously
                // written aligned base as its INSERTION_BASE_* marker.
                if( alignPos > 0 ) {
                    if(alignment[alignPos-1] == (byte) 'A') {
                        alignment[alignPos-1] = PileupElement.INSERTION_BASE_A;
                    } else if(alignment[alignPos-1] == (byte) 'C') {
                        alignment[alignPos-1] = PileupElement.INSERTION_BASE_C;
                    } else if(alignment[alignPos-1] == (byte) 'T') {
                        alignment[alignPos-1] = PileupElement.INSERTION_BASE_T;
                    } else if(alignment[alignPos-1] == (byte) 'G') {
                        alignment[alignPos-1] = PileupElement.INSERTION_BASE_G;
                    }
                }
                // Intentional fall-through: both I and S skip elementLength read bytes.
            case S:
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    readPos++;
                }
                break;
            case D:
            case N:
                // Deletions/skips occupy alignment positions with the deletion marker
                // and consume no read bytes.
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    alignment[alignPos] = PileupElement.DELETION_BASE;
                    alignPos++;
                }
                break;
            case M:
                // Matches/mismatches copy read bytes straight through.
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    alignment[alignPos] = read[readPos];
                    alignPos++;
                    readPos++;
                }
                break;
            case H:
            case P:
                // Hard clips and padding consume neither read nor alignment positions.
                break;
            default:
                throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
        }
    }
    return alignment;
}
/**
* Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and

View File

@ -31,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("7ffa7d0ae9e4265eced1f793be9eba11"));
Arrays.asList("98861116ac1cdaf152adead3183664d8"));
executeTest("test file has annotations, asking for annotations, #1", spec);
}
@ -39,7 +39,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("063235b9a98137902a77b6efbaffc618"));
Arrays.asList("53d083505fc82fb388566d3856ed20ba"));
executeTest("test file has annotations, asking for annotations, #2", spec);
}
@ -63,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("a20994036d2deeeff377a89d9b93a0c6"));
Arrays.asList("c5807c5794c3a847d78e3800553d989a"));
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
}
@ -71,7 +71,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("3677fd14ef2975afdca559a871f57a33"));
Arrays.asList("a9b2a7adee7ba8b0f5d7ff8d92e6dfbd"));
executeTest("test file doesn't have annotations, asking for annotations, #2", spec);
}
@ -79,7 +79,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testOverwritingHeader() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1,
Arrays.asList("d4d33310e0280a9329efb8c6b39f7661"));
Arrays.asList("e3c839910aa82061742e33196b112cb0"));
executeTest("test overwriting header", spec);
}

View File

@ -25,7 +25,7 @@ public class
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("e90af2265bdbfc1c336c7e1484b86a4a"));
Arrays.asList("ae901e034b00aef16d36295821b3ea63"));
executeTest("testMultiSamplePilot1", spec);
}
@ -33,7 +33,7 @@ public class
public void testMultiSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1,
Arrays.asList("9ef1405f3ddf4a9894d12718cc6041a1"));
Arrays.asList("2ad026dee3fe592c124eb8724a843a5e"));
executeTest("testMultiSamplePilot2", spec);
}
@ -41,7 +41,7 @@ public class
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("88a095d59e3210955dd066e54cfff6cd"));
Arrays.asList("bc573090407ae9ec4401757eaa03de20"));
executeTest("testSingleSamplePilot2", spec);
}
@ -51,7 +51,7 @@ public class
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "f213174bc3f6890dbe72628763be75d0";
private final static String COMPRESSED_OUTPUT_MD5 = "1889ecc5aa1b23b2e77b1bd192577f1a";
@Test
public void testCompressedOutput() {
@ -78,7 +78,7 @@ public class
@Test
public void testParallelization() {
String md5 = "4c88572ec014cd0b256b76cb5fac41df";
String md5 = "d19745ab31f903de8d5a8e853b4e52dd";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -105,12 +105,12 @@ public class
@Test
public void testParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-genotype", "9d24c57250ec66905a157975c27f7094" );
e.put( "-all_bases", "6bd860e4de6a4f013693a49556ccfd02" );
e.put( "--min_base_quality_score 26", "94de36ab7021e767f14903b7fd0cf80e" );
e.put( "--min_mapping_quality_score 26", "a86e9cdc629f0957658f8d570014f45b" );
e.put( "--max_mismatches_in_40bp_window 5", "4cf60eeff7f25d8e778c72deb7e14cc2" );
e.put( "--p_nonref_model GRID_SEARCH", "eda1afbdb42c9c5d6fc07a321020071a" );
e.put( "-genotype", "37d3954e19309a24c386758afad93252" );
e.put( "-all_bases", "04568093c5dc70fa7965b4ab15fd0f7e" );
e.put( "--min_base_quality_score 26", "5d1886a9637183707645bc2dc6bf8282" );
e.put( "--min_mapping_quality_score 26", "78423524cf56cce1d0847847d542459f" );
e.put( "--max_mismatches_in_40bp_window 5", "2963c771aafe84b62082f475d20bad5e" );
e.put( "--p_nonref_model GRID_SEARCH", "c254a4e593b4ffb112299be874503ce0" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -124,12 +124,12 @@ public class
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("eda1afbdb42c9c5d6fc07a321020071a"));
Arrays.asList("c254a4e593b4ffb112299be874503ce0"));
executeTest("testConfidence1", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
Arrays.asList("8daa14278976555e64c582c4e44b9b8e"));
Arrays.asList("d2323a0234d27257393f4931fca70dbc"));
executeTest("testConfidence2", spec2);
}
@ -141,8 +141,8 @@ public class
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "3679786112b414546a464c94c900174e" );
e.put( 1.0 / 1850, "efa1cb09fa72dd4bd6dbdf6c0fa0f038" );
e.put( 0.01, "96e85b26cf5f0a523b4b0886dbb902b1" );
e.put( 1.0 / 1850, "5ffecb68b52169ded04312aa5dcdc137" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -165,7 +165,7 @@ public class
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("037e7c0d56e88b4d85f326bf27ad9f1c"));
Arrays.asList("e65c95a9d2a0995078d3e6835cf4ee61"));
executeTest(String.format("testMultiTechnologies"), spec);
}