Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2012-04-09 14:53:19 -04:00
commit 719ec9144a
26 changed files with 1893 additions and 1545 deletions

View File

@ -955,8 +955,8 @@
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
<jvmarg line="${cofoja.jvm.args}"/>
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5678"/> -->
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
<classfileset dir="${java.public.test.classes}" includes="**/@{testtype}.class"/>
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">

View File

@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
@ -190,7 +192,7 @@ public class GenomeAnalysisEngine {
private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
/**
* Actually run the GATK with the specified walker.
@ -216,7 +218,7 @@ public class GenomeAnalysisEngine {
// if the user specified an input BQSR recalibration table then enable on-the-fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();

View File

@ -193,6 +193,16 @@ public class GATKArgumentCollection {
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
/**
* Turns on the base quantization module. It requires a recalibration report (-BQSR).
*
* A value of 0 here means "do not quantize".
* Any value greater than zero will be used to recalculate the quantization using this many levels.
* Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
*/
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
public int quantizationLevels = -1;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;

View File

@ -191,6 +191,16 @@ public class ReferenceContext {
return basesCache;
}
/**
 * All the bases in the window from the current base forward to the end of the window.
 *
 * @return a newly allocated array holding the bases from the locus position through the
 *         end of the window; never shares storage with the underlying cache
 */
public byte[] getForwardBases() {
    final byte[] bases = getBases();
    final int mid = locus.getStart() - window.getStart();
    // copy the tail of the window directly: the previous implementation
    // (new String(bases).substring(mid).getBytes()) allocated a String and a second
    // array per call and was platform-charset dependent -- this was the performance
    // problem flagged by the old TODO
    final byte[] forwardBases = new byte[bases.length - mid];
    System.arraycopy(bases, mid, forwardBases, 0, forwardBases.length);
    return forwardBases;
}
@Deprecated
public char getBaseAsChar() {
return (char)getBase();

View File

@ -19,13 +19,19 @@ import java.util.Map;
public class QuantizationInfo {
private List<Byte> quantizedQuals;
private List<Long> empiricalQualCounts;
int quantizationLevels;
/**
 * Creates a QuantizationInfo with an explicitly provided number of quantization levels.
 *
 * @param quantizedQuals      mapping from original to quantized quality scores, indexed by original quality
 * @param empiricalQualCounts histogram of observation counts per empirical quality
 * @param quantizationLevels  the number of distinct levels present in quantizedQuals
 */
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts, int quantizationLevels) {
    this.quantizedQuals = quantizedQuals;
    this.empiricalQualCounts = empiricalQualCounts;
    this.quantizationLevels = quantizationLevels;
}

/**
 * Convenience constructor that derives the number of levels from the quantized map itself.
 * (The stale duplicate two-argument signature line left over from the diff has been removed;
 * it produced a constructor collision.)
 */
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
    this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
}
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) {
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L;
@ -46,7 +52,9 @@ public class QuantizationInfo {
qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
}
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
quantizeQualityScores(nLevels);
quantizeQualityScores(quantizationLevels);
this.quantizationLevels = quantizationLevels;
}
@ -55,10 +63,20 @@ public class QuantizationInfo {
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
}
/**
 * Disables quantization by installing the identity mapping: every quality score maps to itself.
 */
public void noQuantization() {
    this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE;
    // iterate over the full list so every entry is reset to identity; the previous
    // bound of quantizationLevels left index MAX_QUAL_SCORE untouched when the map
    // holds MAX_QUAL_SCORE+1 entries (as built from the empirical-quality histogram)
    for (int i = 0; i < quantizedQuals.size(); i++)
        quantizedQuals.set(i, (byte) i);
}
/**
 * @return the original-to-quantized quality score mapping, indexed by original quality.
 */
public List<Byte> getQuantizedQuals() {
    return quantizedQuals;
}

/**
 * @return the number of distinct quantization levels currently in effect.
 */
public int getQuantizationLevels() {
    return quantizationLevels;
}
public GATKReportTable generateReportTable() {
GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -71,4 +89,16 @@ public class QuantizationInfo {
}
return quantizedTable;
}
/**
 * Counts the number of distinct quantization levels in a quantized-quality map by
 * counting runs of equal values (the map is piecewise constant over the quality range).
 *
 * @param quantizedQuals original-to-quantized quality mapping
 * @return the number of distinct levels observed
 */
private static int calculateQuantizationLevels(List<Byte> quantizedQuals) {
    int levels = 0;
    byte previous = -1;
    for (final byte quantized : quantizedQuals) {
        if (quantized == previous)
            continue;                       // still inside the current run
        previous = quantized;               // a new level starts here
        levels++;
    }
    return levels;
}
}

View File

@ -25,8 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.utils.QualityUtils;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -37,10 +35,10 @@ import org.broadinstitute.sting.utils.QualityUtils;
public class RecalDatum extends RecalDatumOptimized {
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
//---------------------------------------------------------------------------------------------------------------
//
@ -110,7 +108,11 @@ public class RecalDatum extends RecalDatumOptimized {
}
/**
 * Expected number of erroneous observations implied by the estimated reported quality.
 *
 * @return numObservations scaled by the Phred error probability of estimatedQReported
 */
private double calcExpectedErrors() {
    // the stale duplicate return statement (diff residue using QualityUtils.qualToProb,
    // which is the probability of being CORRECT, not of error) has been removed;
    // the error-probability conversion below is the intended computation
    return (double) this.numObservations * qualToErrorProb(estimatedQReported);
}
/**
 * Converts a (possibly non-integer) Phred-scaled quality score into an error probability.
 *
 * @param qual Phred-scaled quality, may be fractional
 * @return the corresponding probability of error, 10^(-qual/10)
 */
private double qualToErrorProb(final double qual) {
    return Math.pow(10.0, -qual / 10.0);
}
/**

View File

@ -139,7 +139,7 @@ public class RecalibrationReport {
columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false);
}
/**
@ -154,7 +154,7 @@ public class RecalibrationReport {
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false);
}
/**
@ -168,7 +168,7 @@ public class RecalibrationReport {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, true);
}
/**
@ -179,7 +179,7 @@ public class RecalibrationReport {
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList, boolean hasEstimatedQReportedColumn) {
Map<BitSet, RecalDatum> result = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2);
for (Object primaryKey : reportTable.getPrimaryKeys()) {
@ -192,10 +192,13 @@ public class RecalibrationReport {
long nObservations = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
long nErrors = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table
(Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table
Byte.parseByte((String) reportTable.get(primaryKey, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
result.put(bitKey, recalDatum);
}
return result;

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
@ -42,12 +41,13 @@ public class AlleleFrequencyCalculationResult {
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
final private int[] alleleCountsOfMLE;
final private int[] alleleCountsOfMAP;
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
// The posteriors seen, not including that of AF=0
// TODO -- better implementation needed here (see below)
private ArrayList<Double> log10PosteriorMatrixValues = new ArrayList<Double>(100000);
private static final int POSTERIORS_CACHE_SIZE = 5000;
private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE];
private int currentPosteriorsCacheIndex = 0;
private Double log10PosteriorMatrixSum = null;
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
@ -69,14 +69,9 @@ public class AlleleFrequencyCalculationResult {
return log10MAP;
}
public double getLog10PosteriorMatrixSum() {
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
if ( log10PosteriorMatrixSum == null ) {
// TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory;
// TODO -- will discuss with the team what the best option is
final double[] tmp = new double[log10PosteriorMatrixValues.size()];
for ( int i = 0; i < tmp.length; i++ )
tmp[i] = log10PosteriorMatrixValues.get(i);
log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp);
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
}
return log10PosteriorMatrixSum;
}
@ -103,7 +98,7 @@ public class AlleleFrequencyCalculationResult {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
log10PosteriorMatrixValues.clear();
currentPosteriorsCacheIndex = 0;
log10PosteriorMatrixSum = null;
}
@ -116,7 +111,8 @@ public class AlleleFrequencyCalculationResult {
}
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
log10PosteriorMatrixValues.add(log10LofK);
addToPosteriorsCache(log10LofK);
if ( log10LofK > log10MAP ) {
log10MAP = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
@ -124,6 +120,18 @@ public class AlleleFrequencyCalculationResult {
}
}
private void addToPosteriorsCache(final double log10LofK) {
// add to the cache
log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK;
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) {
final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
log10PosteriorMatrixValues[0] = temporarySum;
currentPosteriorsCacheIndex = 1;
}
}
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {

View File

@ -72,6 +72,8 @@ import static java.lang.Math.pow;
*/
public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
protected final static int FIXED_PLOIDY = 2;
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
protected final static double ploidyAdjustment = log10(FIXED_PLOIDY);

View File

@ -45,7 +45,7 @@ public class UnifiedArgumentCollection {
* het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
*/
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
@ -53,7 +53,7 @@ public class UnifiedArgumentCollection {
* effectively acts as a cap on the base qualities.
*/
@Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE;
/**
* Specifies how to determine the alternate allele to use for genotyping

View File

@ -51,6 +51,9 @@ import java.util.*;
public class UnifiedGenotyperEngine {
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
public enum OUTPUT_MODE {
/** produces calls only at variant sites */
EMIT_VARIANTS_ONLY,
@ -324,7 +327,7 @@ public class UnifiedGenotyperEngine {
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
final double sum = AFresult.getLog10PosteriorMatrixSum();
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
@ -367,7 +370,7 @@ public class UnifiedGenotyperEngine {
// the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
@ -378,7 +381,7 @@ public class UnifiedGenotyperEngine {
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
@ -387,7 +390,7 @@ public class UnifiedGenotyperEngine {
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -422,7 +425,7 @@ public class UnifiedGenotyperEngine {
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum();
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
}
@ -620,8 +623,6 @@ public class UnifiedGenotyperEngine {
}
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) {
if( model.name().contains("SNP") )
return HUMAN_SNP_HETEROZYGOSITY;

View File

@ -116,6 +116,15 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
/**
* Some analyses want to count overlap not with dbSNP (which is in general very open) but
* actually want to itemize their overlap specifically with a set of gold standard sites
* such as HapMap, OMNI, or the gold standard indels. This argument provides a mechanism
* for communicating which file to use
*/
@Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false)
public RodBinding<VariantContext> goldStandard = null;
// Help arguments
@Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
protected Boolean LIST = false;

View File

@ -56,6 +56,12 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Number of singleton Indels", format = "%d")
public int n_singleton_indels = 0;
@DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d")
public int n_indels_matching_gold_standard = 0;
@DataPoint(description = "Percent of indels overlapping gold standard sites")
public String gold_standard_matching_rate;
// counts 1 for each site where the number of alleles > 2
public int nMultiIndelSites = 0;
@ -71,18 +77,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Indel novelty rate")
public String indel_novelty_rate;
@DataPoint(description = "1 to 2 bp indel ratio")
public String ratio_of_1_to_2_bp_indels;
@DataPoint(description = "1 to 3 bp indel ratio")
public String ratio_of_1_to_3_bp_indels;
@DataPoint(description = "2 to 3 bp indel ratio")
public String ratio_of_2_to_3_bp_indels;
@DataPoint(description = "1 and 2 to 3 bp indel ratio")
public String ratio_of_1_and_2_to_3_bp_indels;
@DataPoint(description = "Frameshift percent")
public String frameshift_rate_for_coding_indels;
@ -92,9 +86,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Insertion to deletion ratio")
public String insertion_to_deletion_ratio;
@DataPoint(description = "Insertion to deletion ratio for 1 bp events")
public String insertion_to_deletion_ratio_for_1bp_indels;
//
// Frameshifts
//
@ -116,9 +107,25 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0;
int nKnownIndels = 0, nInsertions = 0;
int n1bpInsertions = 0, n1bpDeletions = 0;
int[] countByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
// - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a
// downstream frameshift, if we make the simplifying assumptions that 3 bp ins
// and 3bp del (adding/subtracting 1 AA in general) are roughly comparably
// selected against, we should see a consistent 1+2 : 3 bp ratio for insertions
// as for deletions, and certainly would expect consistency between in/dels that
// multiple methods find and in/dels that are unique to one method (since deletions
// are more common and the artifacts differ, it is probably worth looking at the totals,
// overlaps and ratios for insertions and deletions separately in the methods
// comparison and in this case don't even need to make the simplifying in = del functional assumption
@DataPoint(description = "ratio of 1 and 2 bp insertions to 3 bp insertions")
public String ratio_of_1_and_2_to_3_bp_insertions;
@DataPoint(description = "ratio of 1 and 2 bp deletions to 3 bp deletions")
public String ratio_of_1_and_2_to_3_bp_deletions;
public final static int LARGE_INDEL_SIZE_THRESHOLD = 10;
@ -150,11 +157,11 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
}
break;
case INDEL:
final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard);
if ( eval.isComplexIndel() ) break; // don't count complex substitutions
nIndelSites++;
if ( ! eval.isBiallelic() ) nMultiIndelSites++;
if ( variantWasSingleton(eval) ) n_singleton_indels++;
// collect information about het / hom ratio
for ( final Genotype g : eval.getGenotypes() ) {
@ -164,15 +171,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
for ( Allele alt : eval.getAlternateAlleles() ) {
n_indels++; // +1 for each alt allele
if ( variantWasSingleton(eval) ) n_singleton_indels++;
if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific?
if ( gold != null ) n_indels_matching_gold_standard++;
// ins : del ratios
final int alleleSize = alt.length() - eval.getReference().length();
if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
if ( alleleSize > 0 ) nInsertions++;
if ( alleleSize == 1 ) n1bpInsertions++;
if ( alleleSize == -1 ) n1bpDeletions++;
// requires snpEFF annotations
if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) {
@ -193,6 +199,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
n_large_deletions++;
// update the baby histogram
final int[] countByLength = alleleSize < 0 ? deletionCountByLength : insertionCountByLength;
final int absSize = Math.abs(alleleSize);
if ( absSize < countByLength.length ) countByLength[absSize]++;
@ -210,18 +217,18 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(nMultiIndelSites, nIndelSites);
SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels);
SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels);
gold_standard_matching_rate = Utils.formattedNoveltyRate(n_indels_matching_gold_standard, n_indels);
indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels);
ratio_of_1_to_2_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[2]);
ratio_of_1_to_3_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[3]);
ratio_of_2_to_3_bp_indels = Utils.formattedRatio(countByLength[2], countByLength[3]);
ratio_of_1_and_2_to_3_bp_indels = Utils.formattedRatio(countByLength[1] + countByLength[2], countByLength[3]);
frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting);
ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]);
ratio_of_1_and_2_to_3_bp_insertions = Utils.formattedRatio(insertionCountByLength[1] + insertionCountByLength[2], insertionCountByLength[3]);
SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms);
indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms);
insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions);
insertion_to_deletion_ratio_for_1bp_indels = Utils.formattedRatio(n1bpInsertions, n1bpDeletions);
insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions);
}

View File

@ -0,0 +1,59 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Stratifies the eval RODs into sites where the indel is 1 bp in length and those where the event is 2+ bp.
 * All non-indel events go into all bins, so that SNP counts can be used as contrasts in eval modules.
 */
public class OneBPIndel extends VariantStratifier {
    private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"one.bp", (Object)"two.plus.bp");
    private final static List<Object> ONE_BP = Arrays.asList((Object)"all", (Object)"one.bp");
    private final static List<Object> TWO_PLUS_BP = Arrays.asList((Object)"all", (Object)"two.plus.bp");

    @Override
    public void initialize() {
        states.addAll(ALL);
    }

    @Override
    public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
        // non-indels (and missing evals) contribute to every bin
        if (eval == null || !eval.isIndel())
            return ALL;

        // a single allele longer than 1 bp puts the whole site into the 2+ bin
        for (final int indelLength : eval.getIndelLengths()) {
            if (indelLength > 1)
                return TWO_PLUS_BP;
        }
        return ONE_BP; // every allele is exactly 1 bp long
    }
}

View File

@ -0,0 +1,67 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.Arrays;
import java.util.List;
/**
* Stratifies the eval RODs into sites that are tandem repeats
*/
/**
 * Stratifies the eval RODs into sites that are and are not tandem repeats.
 *
 * Indel eval calls are binned as "is.repeat" or "not.repeat" (plus "all");
 * non-indel or absent eval calls contribute to every bin so they can serve
 * as contrasts in eval modules.
 */
public class TandemRepeat extends VariantStratifier {
    private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"is.repeat", (Object)"not.repeat");
    private final static List<Object> REPEAT = Arrays.asList((Object)"all", (Object)"is.repeat");
    private final static List<Object> NOT_REPEAT = Arrays.asList((Object)"all", (Object)"not.repeat");

    @Override
    public void initialize() {
        states.addAll(ALL);
    }

    @Override
    public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
        if ( eval == null || ! eval.isIndel() )
            return ALL; // only indels can be tandem repeats
        else if ( VariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) )
            return REPEAT;
        else
            return NOT_REPEAT;
    }
}

View File

@ -27,12 +27,11 @@ package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -41,6 +40,7 @@ public class Haplotype {
protected final double[] quals;
private GenomeLoc genomeLocation = null;
private HashMap<String, double[]> readLikelihoodsPerSample = null;
private HashMap<Integer, VariantContext> eventMap = null;
private boolean isRef = false;
private Cigar cigar;
private int alignmentStartHapwrtRef;
@ -97,6 +97,14 @@ public class Haplotype {
return readLikelihoodsPerSample.keySet();
}
/**
 * @return the map from haplotype offset to the variant event at that position,
 *         or null if no event map has been set via setEventMap()
 */
public HashMap<Integer, VariantContext> getEventMap() {
    return eventMap;
}
/**
 * Sets the event map for this haplotype.  The map is stored by reference, not copied.
 *
 * @param eventMap map from haplotype offset to the variant event at that position
 */
public void setEventMap( final HashMap<Integer, VariantContext> eventMap ) {
    this.eventMap = eventMap;
}
/**
 * @return true if this haplotype represents the reference sequence
 */
public boolean isReference() {
    return isRef;
}

View File

@ -224,10 +224,6 @@ public class IndelUtils {
return inds;
}
/**
 * @return the column key names for all indel classifications
 *
 * NOTE(review): this returns the internal COLUMN_KEYS array directly, so callers
 * could mutate it — consider returning a defensive copy.
 */
public static String[] getIndelClassificationNames() {
    return COLUMN_KEYS;
}
public static String getIndelClassificationName(int k) {
if (k >=0 && k < COLUMN_KEYS.length)
return COLUMN_KEYS[k];
@ -235,35 +231,6 @@ public class IndelUtils {
throw new ReviewedStingException("Invalid index when trying to get indel classification name");
}
/**
 * Determines whether this event is classified as an A or T repeat expansion.
 *
 * @param vc  the variant context to classify
 * @param ref the reference context at the event
 * @return true iff any classification index is an A or T repeat expansion
 */
public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) {
    for ( final int index : findEventClassificationIndex(vc, ref) ) {
        if ( index == IND_FOR_REPEAT_EXPANSION_A || index == IND_FOR_REPEAT_EXPANSION_T )
            return true;
    }
    return false;
}
/**
 * Determines whether this event is classified as a C or G repeat expansion.
 *
 * @param vc  the variant context to classify
 * @param ref the reference context at the event
 * @return true iff any classification index is a C or G repeat expansion
 */
public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) {
    for ( final int index : findEventClassificationIndex(vc, ref) ) {
        if ( index == IND_FOR_REPEAT_EXPANSION_C || index == IND_FOR_REPEAT_EXPANSION_G )
            return true;
    }
    return false;
}
/**
 * @return true when the current locus lies strictly inside the event,
 *         i.e. the variant's start differs from the reference locus start
 */
public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) {
    return (vc.getStart() != ref.getLocus().getStart());
}

View File

@ -237,7 +237,7 @@ public class MathUtils {
public static double log10sumLog10(double[] log10p, int start, int finish) {
double sum = 0.0;
double maxValue = Utils.findMaxEntry(log10p);
double maxValue = arrayMax(log10p, finish);
if(maxValue == Double.NEGATIVE_INFINITY)
return sum;
@ -554,7 +554,7 @@ public class MathUtils {
// for precision purposes, we need to add (or really subtract, since they're
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = Utils.findMaxEntry(array);
double maxValue = arrayMax(array);
// we may decide to just normalize in log space without converting to linear space
if (keepInLogSpace) {
@ -627,10 +627,14 @@ public class MathUtils {
return maxI;
}
public static double arrayMax(double[] array) {
public static double arrayMax(final double[] array) {
return array[maxElementIndex(array)];
}
public static double arrayMax(final double[] array, final int endIndex) {
return array[maxElementIndex(array, endIndex)];
}
public static double arrayMin(double[] array) {
return array[minElementIndex(array)];
}

View File

@ -290,32 +290,6 @@ public class Utils {
return m;
}
// returns the maximum value in the array (0.0 for an empty array)
public static double findMaxEntry(double[] array) {
    return findIndexAndMaxEntry(array).first;
}
// returns the index of the maximum value in the array (-1 for an empty array)
public static int findIndexOfMaxEntry(double[] array) {
    return findIndexAndMaxEntry(array).second;
}
// returns the maximum value and the index of its first occurrence;
// (0.0, -1) is returned for an empty array
private static Pair<Double, Integer> findIndexAndMaxEntry(double[] array) {
    if ( array.length == 0 )
        return new Pair<Double, Integer>(0.0, -1);

    // track only the winning index; the value is read out at the end
    int bestIndex = 0;
    for ( int i = 1; i < array.length; i++ ) {
        if ( array[i] > array[bestIndex] )
            bestIndex = i;
    }
    return new Pair<Double, Integer>(array[bestIndex], bestIndex);
}
/**
* Splits expressions in command args by spaces and returns the array of expressions.
* Expressions may use single or double quotes to group any individual expression, but not both.

View File

@ -44,23 +44,24 @@ import java.util.*;
public class BaseRecalibration {
private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
/**
* Constructor using a GATK Report file
*
* @param RECAL_FILE a GATK Report file containing the recalibration information
*/
public BaseRecalibration(final File RECAL_FILE) {
public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) {
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
quantizationInfo = recalibrationReport.getQuantizationInfo();
keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();
requestedCovariates = recalibrationReport.getRequestedCovariates();
quantizationInfo = recalibrationReport.getQuantizationInfo();
if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
quantizationInfo.noQuantization();
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report.
quantizationInfo.quantizeQualityScores(quantizationLevels);
}
/**
@ -71,17 +72,17 @@ public class BaseRecalibration {
* @param read the read to recalibrate
*/
public void recalibrateRead(final GATKSAMRecord read) {
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
final byte[] originalQuals = read.getBaseQualities(errorModel);
final byte[] recalQuals = originalQuals.clone();
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
byte qualityScore = originalQuals[offset];
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
}
recalQuals[offset] = qualityScore;
}
@ -109,6 +110,9 @@ public class BaseRecalibration {
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
double globalDeltaQ = 0.0;

View File

@ -1202,4 +1202,80 @@ public class VariantContextUtils {
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
}
/**
 * Returns true iff VC is a non-complex indel where every allele represents an expansion or
 * contraction of a series of identical bases in the reference.
 *
 * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT
 *
 * If VC = -/CT, then this function returns true because the CT insertion matches exactly the
 * upcoming reference.
 * If VC = -/CTA then this function returns false because the CTA isn't a perfect match
 *
 * Now consider deletions:
 *
 * If VC = CT/- then again the same logic applies and this returns true
 * The case of CTA/- makes no sense because it doesn't actually match the reference bases.
 *
 * The logic of this function is pretty simple.  Take all of the non-null alleles in VC.  For
 * each insertion allele of n bases, check if that allele matches the next n reference bases.
 * For each deletion allele of n bases, check if this matches the reference bases at n to 2n,
 * as it must necessarily match the first n bases.  If this test returns true for all
 * alleles you are a tandem repeat, otherwise you are not.
 *
 * @param vc                          the variant context to test
 * @param refBasesStartingAtVCWithPad note this is assumed to include the PADDED reference base
 * @return true iff every alternate allele of vc is a repeat expansion/contraction of the reference
 */
@Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"})
public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
    if ( ! vc.isIndel() ) // only indels can be tandem repeats
        return false;

    // strip the single padding base; done after the cheap indel check above so
    // non-indel calls don't pay for the String copy
    final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);

    final Allele ref = vc.getReference();
    for ( final Allele allele : vc.getAlternateAlleles() ) {
        if ( ! isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) )
            return false;
    }

    // we've passed all of the tests, so we are a repeat
    return true;
}
/**
 * Helper function for isTandemRepeat that checks that the allele matches somewhere on the reference.
 *
 * @param ref the reference allele
 * @param alt one alternate allele
 * @param refBasesStartingAtVCWithoutPad reference bases following the event, padding base removed
 * @return true when the inserted/deleted bases tandem-match the upcoming reference
 */
protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) {
    // a simple expansion/contraction requires one allele to be a prefix of the other
    if ( ! Allele.oneIsPrefixOfOther(ref, alt) )
        return false;

    final boolean isDeletion = ref.length() > alt.length();
    return isDeletion
            ? basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2)  // deletions must match the reference twice
            : basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); // insertions must match it once
}
/**
 * Checks whether the event bases (the suffix of l beyond s) occur as a tandem repeat
 * at the start of ref, at least minNumberOfMatches times in a row.
 *
 * @param l                  the longer allele's bases
 * @param s                  the shorter allele's bases (a prefix of l)
 * @param ref                the reference bases following the event
 * @param minNumberOfMatches how many consecutive copies of the repeat unit must be present
 * @return true when ref begins with at least minNumberOfMatches copies of the repeat unit
 */
protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) {
    final String repeatUnit = l.substring(s.length()); // the event bases: skip the shared prefix
    final int unitLength = repeatUnit.length();
    final int requiredLength = minNumberOfMatches * unitLength;

    if ( ref.length() < requiredLength )
        return false; // not enough reference bases to test

    // every unit-sized window up to requiredLength must equal the repeat unit
    for ( int offset = 0; offset < requiredLength; offset += unitLength ) {
        if ( ! ref.startsWith(repeatUnit, offset) )
            return false; // repeat didn't match, fail
    }
    return true; // we passed all tests, we matched
}
}

View File

@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
" --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf";
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s",
1, Arrays.asList("c8a782f51e094dc7be06dbfb795feab2"));
1, Arrays.asList("4c00cfa0fd343fef62d19af0edeb4f65"));
executeTestParallel("testSelect1", spec);
}
@ -330,7 +330,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
@Test
public void testCompVsEvalAC() {
String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5c409a2ab4517f862c6678902c0fd7a1"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("4df6654860ad63b7e24e6bc5fbbbcb00"));
executeTestParallel("testCompVsEvalAC",spec);
}
@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --dbsnp " + b37dbSNP132 +
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a27c700eabe6b7b3877c8fd4eabb3975"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3b85cd0fa37539ff51d34e026f26fef2"));
executeTestParallel("testEvalTrackWithoutGenotypes",spec);
}
@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3272a2db627d4f42bc512df49a8ea64b"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bed8751c773b9568218f78c90f13348a"));
executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec);
}
@ -488,11 +488,32 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s"
),
1,
Arrays.asList("41a37636868a838a632559949c5216cf")
Arrays.asList("9726c0c8f19d271cf680f5f16f0926b3")
);
executeTest("testModernVCFWithLargeIndels", spec);
}
// Integration test for the indel-specific stratifiers (Sample, OneBPIndel, TandemRepeat)
// and evaluators (IndelSummary, IndelLengthHistogram) over the NA12878 HiSeq indel
// callset on chr20; the MD5 pins the expected report output.
@Test
public void testStandardIndelEval() {
    WalkerTestSpec spec = new WalkerTestSpec(
            buildCommandLine(
                    "-T VariantEval",
                    "-R " + b37KGReference,
                    "-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf",
                    "-L 20",
                    "-noST -ST Sample -ST OneBPIndel -ST TandemRepeat",
                    "-noEV -EV IndelSummary -EV IndelLengthHistogram",
                    "-gold " + validationDataLocation + "/Mills_and_1000G_gold_standard.indels.b37.sites.vcf",
                    "-D " + b37dbSNP132,
                    "-o %s"
            ),
            1,
            Arrays.asList("c89705147ef4233d5de3a539469bd1d1")
    );
    executeTest("testStandardIndelEval", spec);
}
@Test()
public void testIncompatibleEvalAndStrat() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -284,6 +284,18 @@ public class MathUtilsUnitTest extends BaseTest {
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211}));
}
// Exercises all three log10sumLog10 overloads: summing three log10(1)=0.0 terms
// gives log10(3), a two-term window gives log10(2), and a one-term window gives 0.0.
@Test
public void testLog10sumLog10() {
    final double log3 = 0.477121254719662; // log10(3)
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3), 0);
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3), 0);
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3), 0);
    final double log2 = 0.301029995663981; // log10(2)
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2), 0);
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0), 0);
}
@Test
public void testDotProduct() {
Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0,-3.0,2.0}, new Double[]{6.0,7.0,8.0}),-35.0,1e-3);

View File

@ -20,7 +20,7 @@ public class BaseRecalibrationUnitTest {
@Test(enabled=false)
public void testReadingReport() {
File csv = new File("public/testdata/exampleGATKREPORT.grp");
BaseRecalibration baseRecalibration = new BaseRecalibration(csv);
BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1);
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA));
baseRecalibration.recalibrateRead(read);

View File

@ -589,4 +589,76 @@ public class VariantContextUtilsUnitTest extends BaseTest {
return priority;
}
// --------------------------------------------------------------------------------
//
// Test repeats
//
// --------------------------------------------------------------------------------
// Test fixture: a VariantContext built from allele strings plus the expected
// outcome of VariantContextUtils.isTandemRepeat against the given reference bases.
private class RepeatDetectorTest extends TestDataProvider {
    String ref;           // reference bases, with a leading dummy "N" padding base
    boolean isTrueRepeat; // expected isTandemRepeat() result
    VariantContext vc;    // the event under test

    /**
     * @param isTrueRepeat    expected result of the repeat detector
     * @param ref             reference bases following the event (padding base is prepended here)
     * @param refAlleleString the reference allele
     * @param altAlleleStrings one or more alternate alleles
     */
    private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) {
        super(RepeatDetectorTest.class);
        this.ref = "N" + ref; // add a dummy base for the event here
        this.isTrueRepeat = isTrueRepeat;
        List<Allele> alleles = new LinkedList<Allele>();
        final Allele refAllele = Allele.create(refAlleleString, true);
        alleles.add(refAllele);
        for ( final String altString: altAlleleStrings) {
            final Allele alt = Allele.create(altString, false);
            alleles.add(alt);
        }
        VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles);
        this.vc = builder.make();
    }

    public String toString() {
        return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc);
    }
}
// Builds the table of repeat-detector cases: simple 1 bp events, multi-base repeat
// units, events longer than the available reference, and multi-allelic records
// (which are repeats only when every alt allele matches).
@DataProvider(name = "RepeatDetectorTest")
public Object[][] makeRepeatDetectorTest() {
    new RepeatDetectorTest(true, "AAC", "-", "A");
    new RepeatDetectorTest(true, "AAC", "A", "-");
    new RepeatDetectorTest(false, "AAC", "AA", "-");
    new RepeatDetectorTest(false, "AAC", "-", "C");
    new RepeatDetectorTest(false, "AAC", "A", "C");

    // running out of ref bases => false
    new RepeatDetectorTest(false, "AAC", "-", "CAGTA");

    // complex repeats
    new RepeatDetectorTest(true, "ATATATC", "-", "AT");
    new RepeatDetectorTest(true, "ATATATC", "-", "ATA");
    new RepeatDetectorTest(true, "ATATATC", "-", "ATAT");
    new RepeatDetectorTest(true, "ATATATC", "AT", "-");
    new RepeatDetectorTest(false, "ATATATC", "ATA", "-");
    new RepeatDetectorTest(false, "ATATATC", "ATAT", "-");

    // multi-allelic
    new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATAT");
    new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATA");
    new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATAT");
    new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATA"); // two As
    new RepeatDetectorTest(false, "ATATATC", "AT", "-", "ATC"); // false
    new RepeatDetectorTest(false, "ATATATC", "AT", "-", "CC"); // false
    new RepeatDetectorTest(false, "ATATATC", "AT", "ATAT", "CC"); // false

    return RepeatDetectorTest.getTests(RepeatDetectorTest.class);
}
// Runs isTandemRepeat over every fixture and checks it against the expected flag.
@Test(dataProvider = "RepeatDetectorTest")
public void testRepeatDetectorTest(RepeatDetectorTest cfg) {
    // test alleles are equal
    Assert.assertEquals(VariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat);
}
}

File diff suppressed because it is too large Load Diff