Initial commit of tools under development for data QC through firehose.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2834 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
77af5822d4
commit
04a2784bf7
|
|
@ -0,0 +1,24 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.filters;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 8, 2010
|
||||
*/
|
||||
public class ContaminatedSampleFilter implements SamRecordFilter {
|
||||
|
||||
private final String[] filteredNames = {"NA19562","NA19006","NA19554","NA18985","NA18988","NA19062","NA19559"};
|
||||
|
||||
private HashSet<String> contaminatedSamples = new HashSet<String>( Arrays.asList(filteredNames) );
|
||||
|
||||
public boolean filterOut(SAMRecord read) {
|
||||
return contaminatedSamples.contains(read.getReadGroup().getSample());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 12, 2010
|
||||
*/
|
||||
public class DepthStatisticsCalculator extends SummaryStatisticsCalculator {
|
||||
private int numBasesAbove0x; // % of bases with ANY coverage
|
||||
private int numBasesAbove3x; // % of bases with 4x+
|
||||
private int numBasesAbove9x; // % of bases with 10x+
|
||||
private int numBasesAbove24x; // % of bases with 25x+
|
||||
private int numBasesAbove49x; // % of bases with 50x+
|
||||
private int numBasesAbove99x; // % of bases with 100x+
|
||||
|
||||
private int targetsAbove0x;
|
||||
private int targetsAbove3x;
|
||||
private int targetsAbove9x;
|
||||
private int targetsAbove24x;
|
||||
private int targetsAbove49x;
|
||||
private int targetsAbove99x;
|
||||
private int numTargets;
|
||||
|
||||
public static double[] DEPTH_CUTOFFS = {1,4,10,25,50,100};
|
||||
|
||||
public DepthStatisticsCalculator(String name) {
|
||||
super(name);
|
||||
numBasesAbove0x = 0;
|
||||
numBasesAbove3x = 0;
|
||||
numBasesAbove9x = 0;
|
||||
numBasesAbove24x = 0;
|
||||
numBasesAbove49x = 0;
|
||||
numBasesAbove99x = 0;
|
||||
targetsAbove99x = 0;
|
||||
targetsAbove49x = 0;
|
||||
targetsAbove24x = 0;
|
||||
targetsAbove9x = 0;
|
||||
targetsAbove3x = 0;
|
||||
targetsAbove0x = 0;
|
||||
numTargets = 0;
|
||||
}
|
||||
|
||||
public void updateLocus(int depth) {
|
||||
super.update(depth);
|
||||
|
||||
if ( depth > 99 ) {
|
||||
numBasesAbove99x++;
|
||||
}
|
||||
if ( depth > 49 ) {
|
||||
numBasesAbove49x++;
|
||||
}
|
||||
if ( depth > 24 ) {
|
||||
numBasesAbove24x++;
|
||||
}
|
||||
if ( depth > 9 ) {
|
||||
numBasesAbove9x++;
|
||||
}
|
||||
if ( depth > 3 ) {
|
||||
numBasesAbove3x++;
|
||||
}
|
||||
if ( depth > 0 ) {
|
||||
numBasesAbove0x++;
|
||||
}
|
||||
}
|
||||
|
||||
public void updateTargets(int targetLength, int totalCoverage) {
|
||||
double avgCvg = ( (double) totalCoverage ) / ( (double) targetLength);
|
||||
|
||||
if ( avgCvg >= 100 ) {
|
||||
targetsAbove99x ++;
|
||||
}
|
||||
if ( avgCvg >= 50 ) {
|
||||
targetsAbove49x++;
|
||||
}
|
||||
if ( avgCvg >= 25 ) {
|
||||
targetsAbove24x++;
|
||||
}
|
||||
if ( avgCvg >= 10 ) {
|
||||
targetsAbove9x++;
|
||||
}
|
||||
if ( avgCvg >= 4 ) {
|
||||
targetsAbove3x++;
|
||||
}
|
||||
if ( avgCvg >= 1 ) {
|
||||
targetsAbove0x++;
|
||||
}
|
||||
|
||||
numTargets++;
|
||||
}
|
||||
|
||||
public double[] getLocusProportions() {
|
||||
double[] proportions = new double[6];
|
||||
|
||||
proportions[0] = ( (double) numBasesAbove0x )/( (double) getSampleSize() );
|
||||
proportions[2] = ( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||
proportions[3] = ( (double) numBasesAbove24x )/( (double) getSampleSize() );
|
||||
proportions[4] = ( (double) numBasesAbove49x )/( (double) getSampleSize() );
|
||||
proportions[5] = ( (double) numBasesAbove99x )/( (double) getSampleSize() );
|
||||
proportions[1] = ( (double) numBasesAbove3x )/( (double) getSampleSize() );
|
||||
|
||||
return proportions;
|
||||
}
|
||||
|
||||
public double[] getTargetProportions() {
|
||||
double[] proportions = new double[6];
|
||||
|
||||
proportions[0] = ( (double) targetsAbove0x )/( (double) numTargets );
|
||||
proportions[2] = ( (double) targetsAbove9x )/( (double) numTargets );
|
||||
proportions[3] = ( (double) targetsAbove24x )/( (double) numTargets );
|
||||
proportions[4] = ( (double) targetsAbove49x )/( (double) numTargets );
|
||||
proportions[5] = ( (double) targetsAbove99x )/( (double) numTargets );
|
||||
proportions[1] = ( (double) targetsAbove3x )/( (double) numTargets );
|
||||
|
||||
return proportions;
|
||||
}
|
||||
|
||||
public double getPercentWellCoveredLoci() {
|
||||
return 10*( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||
}
|
||||
|
||||
public double getPercentWellCoveredTargets() {
|
||||
return 10*( (double) targetsAbove9x )/( (double) numTargets );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,257 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 11, 2010
|
||||
*/
|
||||
|
||||
class FirehoseSummaryCLP extends CommandLineProgram {
|
||||
@Argument(fullName = "depthOfCoverageFile", shortName = "doc", doc="Path to the depth of coverage file", required=true)
|
||||
private File depthOfCoverage = null;
|
||||
@Argument(fullName = "contaminationFile", shortName = "con", doc="Path to the contamination file", required=true)
|
||||
private File contamination = null;
|
||||
@Argument(fullName = "errorRateFile", shortName = "err", doc="Path to the error rate file", required=true)
|
||||
private File errorRate = null;
|
||||
@Argument(fullName = "zipFiles", shortName = "zip", doc="List of paths to zip files which contain summary metrics files", required=false)
|
||||
private String zipFiles = null;
|
||||
|
||||
private static String R_SCRIPT = "plotFirehoseDataQCMetrics.R";
|
||||
private static String SCRIPT_DOC_FLAG = "DOC";
|
||||
|
||||
protected int execute() {
|
||||
SummaryFileCollection metricsFiles = getFileHandles();
|
||||
List<DepthStatisticsCalculator> depthStats = calculateDepthStatistics(depthOfCoverage);
|
||||
String docSummary = makeDOCPlots(depthStats);
|
||||
return 1;
|
||||
}
|
||||
|
||||
private String makeDOCPlots(List<DepthStatisticsCalculator> calcs) {
|
||||
StringBuilder summaryBuilder = new StringBuilder();
|
||||
int numSamplesWithGoodBaseCoverage=0;
|
||||
int numSamplesWithGoodMeanCoverage = 0;
|
||||
int numSamplesWithGoodTargetCoverage=0;
|
||||
double aggregateMeanCoverage = -1d;
|
||||
double aggregateStdDevCoverage = -1d;
|
||||
double aggregateSkewCoverage = -1d;
|
||||
PrintWriter locusOutput;
|
||||
PrintWriter targetOutput;
|
||||
File locusOutputFile;
|
||||
File targetOutputFile;
|
||||
try {
|
||||
locusOutputFile = File.createTempFile("locus_output","txt");
|
||||
locusOutput = new PrintWriter(locusOutputFile);
|
||||
targetOutputFile = File.createTempFile("target_output","txt");
|
||||
targetOutput = new PrintWriter(targetOutputFile);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error creating temporary output files for R-plotting. Perhaps user permissions do not allow creation of files.",e);
|
||||
}
|
||||
|
||||
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage") || calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||
// update summary statistics
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage") ) {
|
||||
aggregateMeanCoverage = calc.getMean();
|
||||
aggregateStdDevCoverage = Math.sqrt(calc.getVar());
|
||||
aggregateSkewCoverage = calc.getSkew();
|
||||
}
|
||||
} else {
|
||||
// update sample-based summary statistics and print output to file
|
||||
locusOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n",calc.getName(),calc.getLocusProportions());
|
||||
targetOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n", calc.getName(),calc.getTargetProportions());
|
||||
if ( calc.getPercentWellCoveredLoci() > 80 ) {
|
||||
numSamplesWithGoodBaseCoverage++;
|
||||
}
|
||||
if ( calc.getPercentWellCoveredTargets() > 90 ) {
|
||||
numSamplesWithGoodTargetCoverage++;
|
||||
}
|
||||
if ( calc.getMean() > 10 ) {
|
||||
numSamplesWithGoodMeanCoverage++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfBasesCoveredBySample",locusOutputFile.getAbsolutePath());
|
||||
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfTargetsCoveredBySample",targetOutputFile.getAbsolutePath());
|
||||
return "temporary";
|
||||
}
|
||||
|
||||
private SummaryFileCollection getFileHandles() {
|
||||
if ( zipFiles == null ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
SummaryFileCollection summaryFiles = new SummaryFileCollection();
|
||||
for ( String zipFile : zipFiles.split(",") ) {
|
||||
summaryFiles.process(zipFile);
|
||||
}
|
||||
|
||||
return summaryFiles;
|
||||
}
|
||||
|
||||
private List<DepthStatisticsCalculator> calculateDepthStatistics(File docFile) {
|
||||
BufferedReader docReader;
|
||||
|
||||
try {
|
||||
docReader = new BufferedReader( new FileReader(docFile) );
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e);
|
||||
}
|
||||
|
||||
String locusHeader = getDOCSectionHeader(docReader);
|
||||
List<DepthStatisticsCalculator> docCalculators = instantiateDOCCalculators(locusHeader);
|
||||
|
||||
updateLocusInfo(docCalculators,docReader);
|
||||
|
||||
String targetHeader = getDOCSectionHeader(docReader);
|
||||
|
||||
updateTargetInfo(docCalculators,docReader);
|
||||
|
||||
return docCalculators;
|
||||
}
|
||||
|
||||
private void updateLocusInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.get(offset).updateLocus(Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("Error reading locus depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void updateTargetInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
int targetSize = 0;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset == -1 ) {
|
||||
targetSize = parseInterval(entry);
|
||||
} else {
|
||||
calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error reading target depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private int parseInterval(String interval) {
|
||||
String startstop = interval.split(":")[1];
|
||||
int start = Integer.parseInt(startstop.split("-")[0]);
|
||||
int stop = Integer.parseInt(startstop.split("-")[1]);
|
||||
return stop - start;
|
||||
}
|
||||
|
||||
private boolean isDOCSectionSeparator( String line ) {
|
||||
return line.contains("_COVERAGE_SECTION");
|
||||
}
|
||||
|
||||
private boolean isEndOfSection( String line ) {
|
||||
// sections delimited by empty line
|
||||
return line.equalsIgnoreCase("");
|
||||
}
|
||||
|
||||
private String getDOCSectionHeader(BufferedReader reader) {
|
||||
String header;
|
||||
try {
|
||||
do {
|
||||
header = reader.readLine();
|
||||
} while ( ! isDOCSectionSeparator(header) );
|
||||
|
||||
header = reader.readLine();
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Error reading depth of coverage file",e);
|
||||
}
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
private List<DepthStatisticsCalculator> instantiateDOCCalculators(String header) {
|
||||
List<DepthStatisticsCalculator> calcs = new ArrayList<DepthStatisticsCalculator>();
|
||||
|
||||
int offset = -1;
|
||||
for ( String entry : header.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.add(new DepthStatisticsCalculator(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
|
||||
return calcs;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public class GenerateFirehoseSummary {
|
||||
|
||||
public static void main(String[] args) {
|
||||
FirehoseSummaryCLP clp = new FirehoseSummaryCLP();
|
||||
CommandLineProgram.start(clp, args);
|
||||
System.exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Container for the per-sample metrics files extracted from Firehose
 * sequencing-metrics zip archives: fingerprint, hybrid-selection, insert-size,
 * and alignment metrics, each keyed by sample name.
 */
class SummaryFileCollection {

    // sample name -> metrics file, one map per metrics type
    public Map<String,File> fingerprintSummaryFiles;
    public Map<String,File> hybridSelectionMetricsFiles;
    public Map<String,File> insertSizeDistributionFiles;
    public Map<String,File> alignmentMetricsFiles;

    public SummaryFileCollection() {
        fingerprintSummaryFiles = new HashMap<String,File>();
        hybridSelectionMetricsFiles = new HashMap<String, File>();
        insertSizeDistributionFiles = new HashMap<String,File>();
        alignmentMetricsFiles = new HashMap<String,File>();
    }

    /**
     * Unzips the given archive and registers the four expected metrics files for
     * the sample it belongs to.
     *
     * @param zipFilePath path to a "*_sequencing_metrics.zip" archive
     * @throws RuntimeException if the archive cannot be unzipped
     */
    public void process(String zipFilePath) {
        // NOTE(review): assumes paths of the form "<prefix>_<sample>_sequencing_metrics.zip";
        // confirm against the actual Firehose naming convention.
        String sampleName = zipFilePath.split("_sequencing_metrics.zip")[0].split("_")[1];
        File fingerprintSummaryFile = new File(sampleName+".summary_fingerprint_metrics");
        File hybridSelectionFile = new File(sampleName+".hybrid_selection_metrics");
        File insertSizeFile = new File(sampleName+".insert_size_metrics");
        File alignmentFile = new File(sampleName+".alignment_metrics");

        try {
            // BUGFIX: use ProcessBuilder so a path containing spaces is passed as a
            // single argument (Runtime.exec(String) tokenizes on whitespace), and
            // wait for unzip to finish before the extracted files are used --
            // previously the process was fired and forgotten.
            Process p = new ProcessBuilder("unzip", zipFilePath).start();
            int exitCode = p.waitFor();
            if ( exitCode != 0 ) {
                throw new RuntimeException("unzip exited with code "+exitCode+" for file "+zipFilePath);
            }
        } catch (IOException e) {
            // BUGFIX: preserve the underlying cause instead of discarding it
            throw new RuntimeException("Could not unzip the file "+zipFilePath, e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag
            throw new RuntimeException("Interrupted while unzipping "+zipFilePath, e);
        }

        fingerprintSummaryFiles.put(sampleName,fingerprintSummaryFile);
        hybridSelectionMetricsFiles.put(sampleName,hybridSelectionFile);
        insertSizeDistributionFiles.put(sampleName,insertSizeFile);
        alignmentMetricsFiles.put(sampleName,alignmentFile);
    }
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
/**
 * Streaming (on-the-fly) estimator of the first three sample moments -- mean,
 * variance, and skew -- of a data stream. The variance and skew updates reuse
 * the current running estimates rather than re-scanning all prior data, so they
 * are approximations that converge with sample size (see update(double)).
 *
 * @author chartl
 * @since Feb 12, 2010
 */
public class SummaryStatisticsCalculator {
    // todo -- can median / quantiles be estimated on-the-fly?
    // todo -- can we divine a metric for modality?
    private double mean;    // running mean of the samples fed to it
    private double var;     // running (approximate) variance of the samples fed to it
    private double skew;    // running (approximate) skew of the samples fed to it
    private int sampleSize; // number of samples given to calculator
    private String name;    // name to associate calculator with, if any

    /** Creates an unnamed calculator; the name defaults to "SUMMARY". */
    public SummaryStatisticsCalculator() {
        this("SUMMARY"); // delegate so initialization lives in one place
    }

    /**
     * Creates a calculator tagged with the given name.
     *
     * @param name label to associate with this calculator (e.g. a sample name)
     */
    public SummaryStatisticsCalculator(String name) {
        mean = 0;
        var = 0;
        skew = 0;
        sampleSize = 0;
        this.name = name;
    }

    /**
     * Folds a new datum into the running moment estimates.
     *
     * @param datum the new observation
     */
    public void update(double datum) {
        mean = (sampleSize*mean + datum)/(sampleSize + 1);
        var = (sampleSize*var + Math.pow( mean - datum , 2 ) )/( sampleSize + 1);
        // note: the variance is not re-calculated over all data points each time the mean changes
        // this means the convergence will be slower than x^-(1/2); but it will still converge with the mean
        // whole exome is ~3.3 million datapoints, which will be plenty for this to converge to within a very
        // small proportion of the true sample variance
        if ( var == 0 ) {
            // all samples so far are identical (or this is the first); skew is 0 by definition
            skew = 0;
        } else {
            // BUGFIX: skew is normalized by sigma^3 = var^(3/2). The previous code used
            // Math.pow(var, 2/3) where 2/3 is integer division == 0, so the normalization
            // factor was always 1 and the "skew" was never scale-free.
            double sigmaCubed = Math.pow(var, 1.5);
            skew = ( skew*sigmaCubed*sampleSize + Math.pow( datum - mean, 3) ) / ( ( sampleSize + 1 )*sigmaCubed );
        }
        // again, new mean/variance estimates are not propagated over all previous data points,
        // so convergence is a bit slower; but that's the price one pays for on-the-fly calculation
        sampleSize++;
    }

    /** Convenience overload for integer data. */
    public void update(int datum) {
        update( (double) datum);
    }

    /** @return the running mean of all data seen so far */
    public double getMean() {
        return mean;
    }

    /** @return the running (approximate) variance of all data seen so far */
    public double getVar() {
        return var;
    }

    /** @return the running (approximate) skew of all data seen so far */
    public double getSkew() {
        return skew;
    }

    /** @return the name this calculator was constructed with */
    public String getName() {
        return name;
    }

    /** @return the number of data points folded in so far */
    public int getSampleSize() {
        return sampleSize;
    }
}
|
||||
|
|
@ -20,33 +20,33 @@ class VCFConcordanceCalculator {
|
|||
private int minimumDepthForUpdate;
|
||||
private int minimumGenotypeQuality;
|
||||
private String name;
|
||||
private Set<GenomeLoc> falsePositiveLoci;
|
||||
private Set<GenomeLoc> falseNegativeLoci;
|
||||
private Set<GenomeLoc> falseNegativeLociDueToNoCall;
|
||||
private Set<GenomeLoc> falseNegativeLociDueToFilters;
|
||||
private Set<GenomeLoc> hetsCalledHoms;
|
||||
private Set<GenomeLoc> homsCalledHets;
|
||||
private Set<GenomeLoc> nonConfidentGenotypeCalls;
|
||||
private Set<GenomeLoc> concordantHomCalls;
|
||||
private Set<GenomeLoc> concordantHetCalls;
|
||||
private Set<GenomeLoc> concordantGenotypeReferenceCalls;
|
||||
private Set<GenomeLoc> chipNoCalls;
|
||||
private Set<GenomeLoc> ignoredDueToDepth;
|
||||
private int falsePositiveLoci;
|
||||
private int falseNegativeLoci;
|
||||
private int falseNegativeLociDueToNoCall;
|
||||
private int falseNegativeLociDueToFilters;
|
||||
private int hetsCalledHoms;
|
||||
private int homsCalledHets;
|
||||
private int nonConfidentGenotypeCalls;
|
||||
private int concordantHomCalls;
|
||||
private int concordantHetCalls;
|
||||
private int concordantGenotypeReferenceCalls;
|
||||
private int chipNoCalls;
|
||||
private int ignoredDueToDepth;
|
||||
|
||||
public VCFConcordanceCalculator(String sampleName, int minimumDepth, int minGenQual) {
|
||||
name = sampleName;
|
||||
falseNegativeLoci = new HashSet<GenomeLoc>();
|
||||
falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
|
||||
falsePositiveLoci = new HashSet<GenomeLoc>();
|
||||
falseNegativeLociDueToFilters = new HashSet<GenomeLoc>();
|
||||
hetsCalledHoms = new HashSet<GenomeLoc>();
|
||||
homsCalledHets = new HashSet<GenomeLoc>();
|
||||
nonConfidentGenotypeCalls = new HashSet<GenomeLoc>();
|
||||
concordantHomCalls = new HashSet<GenomeLoc>();
|
||||
concordantHetCalls = new HashSet<GenomeLoc>();
|
||||
concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
|
||||
chipNoCalls = new HashSet<GenomeLoc>();
|
||||
ignoredDueToDepth = new HashSet<GenomeLoc>();
|
||||
falseNegativeLoci = 0;
|
||||
falseNegativeLociDueToNoCall = 0;
|
||||
falsePositiveLoci = 0;
|
||||
falseNegativeLociDueToFilters = 0;
|
||||
hetsCalledHoms = 0;
|
||||
homsCalledHets = 0;
|
||||
nonConfidentGenotypeCalls = 0;
|
||||
concordantHomCalls = 0;
|
||||
concordantHetCalls = 0;
|
||||
concordantGenotypeReferenceCalls = 0;
|
||||
chipNoCalls = 0;
|
||||
ignoredDueToDepth = 0;
|
||||
minimumDepthForUpdate = minimumDepth;
|
||||
minimumGenotypeQuality = minGenQual;
|
||||
}
|
||||
|
|
@ -57,37 +57,37 @@ class VCFConcordanceCalculator {
|
|||
|
||||
public void updateTruthOnly(LocusConcordanceInfo info) {
|
||||
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase() ) ) {
|
||||
falseNegativeLoci.add(info.getLoc());
|
||||
falseNegativeLoci++;
|
||||
} else {
|
||||
concordantGenotypeReferenceCalls.add(info.getLoc());
|
||||
concordantGenotypeReferenceCalls++;
|
||||
}
|
||||
}
|
||||
|
||||
public void updateFilteredLocus(LocusConcordanceInfo info) {
|
||||
|
||||
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase()) ) {
|
||||
falseNegativeLociDueToFilters.add(info.getLoc());
|
||||
falseNegativeLociDueToFilters++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(),
|
||||
concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),nonConfidentGenotypeCalls.size(),
|
||||
homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),
|
||||
falseNegativeLociDueToNoCall.size(),falseNegativeLociDueToFilters.size());
|
||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth,
|
||||
concordantGenotypeReferenceCalls,concordantHomCalls,concordantHetCalls,nonConfidentGenotypeCalls,
|
||||
homsCalledHets,hetsCalledHoms,falsePositiveLoci,falseNegativeLoci,
|
||||
falseNegativeLociDueToNoCall,falseNegativeLociDueToFilters);
|
||||
}
|
||||
|
||||
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
|
||||
if ( minimumDepthForUpdate > 0 && call.getReadCount() < minimumDepthForUpdate ) {
|
||||
ignoredDueToDepth.add(loc);
|
||||
ignoredDueToDepth++;
|
||||
} else if ( truth.isNoCall() ) {
|
||||
chipNoCalls.add(loc);
|
||||
chipNoCalls++;
|
||||
} else if ( truth.isVariant(( char) ref) ) {
|
||||
if ( call.isNoCall() ) {
|
||||
falseNegativeLociDueToNoCall.add(loc);
|
||||
falseNegativeLociDueToNoCall++;
|
||||
} else if ( ! call.isVariant( (char) ref ) ) {
|
||||
falseNegativeLoci.add(loc);
|
||||
falseNegativeLoci++;
|
||||
} else if ( call.isVariant((char) ref) ) {
|
||||
// check het vs hom
|
||||
checkGenotypeCall(truth,call, loc);
|
||||
|
|
@ -96,9 +96,9 @@ class VCFConcordanceCalculator {
|
|||
} else if ( ! truth.isVariant( (char) ref ) ) {
|
||||
|
||||
if ( call.isVariant((char) ref) ) {
|
||||
falsePositiveLoci.add(loc);
|
||||
falsePositiveLoci++;
|
||||
} else {
|
||||
concordantGenotypeReferenceCalls.add(loc);
|
||||
concordantGenotypeReferenceCalls++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -107,18 +107,18 @@ class VCFConcordanceCalculator {
|
|||
if ( ! call.isFiltered() && 10*call.getNegLog10PError() > minimumGenotypeQuality) {
|
||||
|
||||
if ( truth.isHet() && call.isHom() ) {
|
||||
hetsCalledHoms.add(loc);
|
||||
hetsCalledHoms++;
|
||||
} else if ( truth.isHom() && call.isHet() ) {
|
||||
homsCalledHets.add(loc);
|
||||
homsCalledHets++;
|
||||
} else if ( ( truth.isHet() && call.isHet() ) ) {
|
||||
concordantHetCalls.add(loc);
|
||||
concordantHetCalls++;
|
||||
} else if ( truth.isHom() && call.isHom() ) { // be extra careful
|
||||
concordantHomCalls.add(loc);
|
||||
concordantHomCalls++;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
nonConfidentGenotypeCalls.add(loc);
|
||||
nonConfidentGenotypeCalls++;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,10 @@ import net.sf.samtools.SAMReadGroupRecord;
|
|||
|
||||
import cern.jet.stat.Probability;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
|
||||
* to be contaminants (meaning a read group that's not actually associated with the sample) by searching
|
||||
|
|
@ -28,8 +32,14 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
|
||||
private Double LIMIT = 1e-9;
|
||||
|
||||
@Argument(fullName="scaleForSample", shortName="scale", doc="the scale by which the pvalue limit should reduce for testing samples directly. "+
|
||||
"E.g. if a sample has three 1e-3 read groups, pvalue is 1e-9 -- significant; so the scale should reduce by some multiplicative factor"+
|
||||
"For each read group associated with the sample. Defaults to 1e-4 [1e-9 for 1 RG, 1e-13 for 2 RG, 1e-17 for 3, etc]", required=false)
|
||||
private Double SCALE = 1e-4;
|
||||
|
||||
private UnifiedGenotyperEngine ug;
|
||||
private NamedTable altTable;
|
||||
private final double EPSILON = 1e-20;
|
||||
|
||||
public void initialize() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
|
@ -140,6 +150,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
//out.println("readgroup\tpvalue\tstatus\tbalances");
|
||||
out.printf("%-10s\t%-13s\t%-10s\t%-10s%n", "readgroup", "pvalue", "status", "balances");
|
||||
|
||||
HashMap<String,Double> pvalByReadGroup = new HashMap<String,Double>();
|
||||
for (String rg : altTable.getRowNames()) {
|
||||
String balances = "";
|
||||
|
||||
|
|
@ -179,6 +190,8 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
|
||||
// Compute pValue
|
||||
double pValue = Probability.studentT(dof, t);
|
||||
pValue = pValue < EPSILON ? EPSILON : pValue;
|
||||
pvalByReadGroup.put(rg,pValue);
|
||||
|
||||
//out.printf("%s\t%e\t%s\t[%s]\n", rg, pValue, (pValue < LIMIT ? "aberrant" : "nominal"), balances);
|
||||
out.printf("%-10s\t%-13s\t%-10s\t[%-10s]\n",
|
||||
|
|
@ -186,6 +199,37 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
String.format("%e", pValue),
|
||||
(pValue < LIMIT ? "aberrant" : "nominal"),
|
||||
balances);
|
||||
|
||||
logger.debug(rg);
|
||||
}
|
||||
|
||||
out.printf("%n%n%s%n","SECTION ON BADLY CONTAMINATED SAMPLES");
|
||||
out.printf("%s\t%s\t%s\t%s%n","sample","p-value","status","info");
|
||||
|
||||
HashMap<String,List<String>> samplesToReadGroups = new HashMap<String,List<String>>();
|
||||
for ( SAMReadGroupRecord rec : getToolkit().getSAMFileHeader().getReadGroups() ) {
|
||||
if ( samplesToReadGroups.containsKey(rec.getSample()) ) {
|
||||
samplesToReadGroups.get(rec.getSample()).add(rec.getReadGroupId());
|
||||
} else {
|
||||
ArrayList<String> newList = new ArrayList<String>();
|
||||
newList.add(rec.getReadGroupId());
|
||||
samplesToReadGroups.put(rec.getSample(),newList);
|
||||
}
|
||||
}
|
||||
|
||||
for ( String sample : samplesToReadGroups.keySet() ) {
|
||||
double p_value = 1;
|
||||
double limit = LIMIT;
|
||||
boolean containsAberrantReads = false;
|
||||
for ( String rg : samplesToReadGroups.get(sample) ) {
|
||||
double rg_pval = ( pvalByReadGroup.get(rg) == null ? 1 : pvalByReadGroup.get(rg) );
|
||||
p_value = p_value*rg_pval;
|
||||
containsAberrantReads = containsAberrantReads || rg_pval < LIMIT;
|
||||
limit = limit*SCALE;
|
||||
logger.debug(rg);
|
||||
}
|
||||
|
||||
out.printf("%s\t%-13s\t%s\t%s%n", sample, String.format("%e",p_value), ( p_value < limit ? "aberrant" : "nominal"), ( containsAberrantReads ? "contains_aberrant_RG" : "no_aberrant_RG"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue