From a679bdde189dc77229cd629d935266a4be3c3f04 Mon Sep 17 00:00:00 2001
From: kiran <kiran@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sun, 8 Nov 2009 16:36:39 +0000
Subject: [PATCH] FindContaminatingReadGroupsWalker lists read groups in a
 single-sample BAM file that appear to be contaminants by searching for
 evidence of systematic underperformance at likely homozygous-variant sites.

Procedure:
1. Sites that are likely homozygous-variant but are called as heterozygous are identified.
2. For each site and read group, we compute the proportion of bases in the pileup supporting an alternate allele.
3. A one-sample, left-tailed t-test is performed with the null hypothesis being that the alternate allele distribution has a mean of 0.95 and the alternate hypothesis being that the true mean is statistically significantly less than expected (pValue < 1e-9).


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1989 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../FindContaminatingReadGroupsWalker.java    | 209 ++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java

diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java
new file mode 100755
index 000000000..c3ec7c315
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java
@@ -0,0 +1,209 @@
+package org.broadinstitute.sting.playground.gatk.walkers.contamination;
+
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.utils.genotype.Genotype;
+import org.broadinstitute.sting.utils.genotype.GenotypeLocusData;
+import org.broadinstitute.sting.utils.Pair;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.ReadBackedPileup;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import org.broadinstitute.sting.playground.utils.NamedTable;
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMReadGroupRecord;
+
+import java.util.List;
+
+import cern.jet.stat.Probability;
+
+/**
+ * FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
+ * to be contaminants by searching for evidence of systematic underperformance at likely
+ * homozygous-variant sites.  First, sites that are likely homozygous-variant but are called
+ * as heterozygous are identified.  Next, per each site and read group, we compute the proportion
+ * of bases in the pileup supporting an alternate allele.  Finally, a one-sample, left-tailed
+ * t-test is performed with the null hypothesis being that the alternate allele distribution has
+ * a mean of 0.95 and the alternate hypothesis being that the true mean is statistically
+ * significantly less than expected.
+ *
+ * @author Kiran Garimella
+ */
+public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Integer> {
+    @Argument(fullName="verbose", shortName="V", doc="Prints information for all loci, not just the suspected contaminating read groups", required=false)
+    private Boolean VERBOSE = false;
+
+    @Argument(fullName="balance", shortName="bal", doc="The expected alternate allele balance for homozygous-variant sites", required=false)
+    private Double BALANCE = 0.95;
+
+    @Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
+    private Double LIMIT = 1e-9;
+
+    private UnifiedArgumentCollection uac;
+    private UnifiedGenotyper ug;
+    private NamedTable altTable;
+
+    public void initialize() {
+        uac = new UnifiedArgumentCollection();
+        uac.genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
+        uac.CONFIDENCE_THRESHOLD = 50;
+
+        ug = new UnifiedGenotyper();
+        ug.initialize();
+        ug.setUnifiedArgumentCollection(uac);
+
+        altTable = new NamedTable();
+    }
+
+    /**
+     * Identify likely homozygous-variant sites that are called as
+     * heterozygous, so that we can isolate our inspection to these sites.
+     *
+     * @param tracker  the meta-data tracker
+     * @param ref      information regarding the reference
+     * @param context  information regarding the reads
+     * @return true if this site is a suspicious het, false if otherwise
+     */
+    public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        int altCount = 0;
+        int totalCount = 0;
+
+        ReadBackedPileup pileup = new ReadBackedPileup(ref.getBase(), context);
+        int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
+
+        for (byte base : pileup.getBases().getBytes()) {
+            int baseIndex = BaseUtils.simpleBaseToBaseIndex((char) base);
+
+            if (baseIndex != refIndex) {
+                altCount++;
+            }
+            totalCount++;
+        }
+
+        double altBalance = ((double) altCount)/((double) totalCount);
+
+        if (altBalance > 0.70) {
+            Pair<List<Genotype>, GenotypeLocusData> ugResult = ug.map(tracker, ref, context);
+
+            if (ugResult != null && ugResult.first != null) {
+                return ugResult.first.get(0).isHet();
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * For each read group represented in the pileup, determine the fraction of bases supporting the alternate allele
+     *
+     * @param tracker  the meta-data tracker
+     * @param ref      information regarding the reference
+     * @param context  information regarding the reads
+     * @return 1
+     */
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        NamedTable alleleCounts = new NamedTable();
+
+        int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
+        String colName = String.format("%s.%d", context.getContig(), context.getPosition());
+
+        for (int i = 0; i < context.numReads(); i++) {
+            SAMRecord read = context.getReads().get(i);
+            int offset = context.getOffsets().get(i);
+
+            SAMReadGroupRecord rg = read.getReadGroup();
+            int alleleIndex = BaseUtils.simpleBaseToBaseIndex((char) read.getReadBases()[offset]);
+
+            alleleCounts.increment(rg.getReadGroupId(), (alleleIndex == refIndex) ? "ref" : "alt");
+        }
+
+        for (String rg : alleleCounts.getRowNames()) {
+            double altCount = alleleCounts.get(rg, "alt");
+            double refCount = alleleCounts.get(rg, "ref");
+
+            altTable.set(rg, colName, altCount / (altCount + refCount));
+        }
+
+        return 1;
+    }
+
+    /**
+     * Provide an initial value for reduce computations.
+     *
+     * @return Initial value of reduce.
+     */
+    public Integer reduceInit() {
+        return null;
+    }
+
+    /**
+     * Reduces a single map with the accumulator provided as the ReduceType.
+     *
+     * @param value result of the map.
+     * @param sum   accumulator for the reduce.
+     * @return accumulator with result of the map taken into account.
+     */
+    public Integer reduce(Integer value, Integer sum) {
+        return null;
+    }
+
+    /**
+     * Perform the t-test and list the read groups that are significant underperformers.
+     *
+     * @param result  the number of suspicious sites we're inspecting (this argument is ignored)
+     */
+    public void onTraversalDone(Integer result) {
+        if (VERBOSE) { out.println("#readgroup\tpvalue\tbalances"); }
+
+        for (String rg : altTable.getRowNames()) {
+            String balances = "";
+
+            // Compute mean
+            double sum = 0.0, total = 0.0;
+
+            for (String locus : altTable.getColumnNames()) {
+                double value = altTable.get(rg, locus);
+
+                sum += value;
+                total += 1.0;
+
+                balances += String.format("%2.2f,", value);
+            }
+
+            double mean = sum/total;
+
+            // Compute stdev
+            double squareSumOfMeanDifferences = 0.0;
+
+            for (String locus : altTable.getColumnNames()) {
+                double value = altTable.get(rg, locus);
+
+                squareSumOfMeanDifferences += Math.pow(value - mean, 2.0);
+            }
+
+            double stdev = Math.sqrt(squareSumOfMeanDifferences/total);
+
+            // Compute standard error of the mean (SEM)
+            double sem = stdev/Math.sqrt(total);
+
+            // Compute test statistic t
+            double t = (mean - BALANCE) / sem;
+
+            // Degrees of freedom
+            double dof = total - 1.0;
+
+            // Compute pValue
+            double pValue = Probability.studentT(dof, t);
+
+            if (pValue < LIMIT) {
+                out.printf("%s\t%e\t[%s]\n", rg, pValue, balances);
+            } else {
+                if (VERBOSE) { out.printf("#%s\t%e\t[%s]\n", rg, pValue, balances); }
+            }
+        }
+    }
+}