FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear to be contaminants by searching for evidence of systematic underperformance at likely homozygous-variant sites.

Procedure:
1. Sites that are likely homozygous-variant but are called as heterozygous are identified.
2. For each site and read group, we compute the proportion of bases in the pileup supporting an alternate allele.
3. A one-sample, left-tailed t-test is performed with the null hypothesis being that the alternate allele distribution has a mean of 0.95 and the alternative hypothesis being that the true mean is statistically significantly less than expected (pValue < 1e-9).



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1989 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-11-08 16:36:39 +00:00
parent 2225d8176e
commit a679bdde18
1 changed files with 209 additions and 0 deletions

View File

@ -0,0 +1,209 @@
package org.broadinstitute.sting.playground.gatk.walkers.contamination;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.GenotypeLocusData;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.utils.NamedTable;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import java.util.List;
import cern.jet.stat.Probability;
/**
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
* to be contaminants by searching for evidence of systematic underperformance at likely
* homozygous-variant sites. First, sites that are likely homozygous-variant but are called
* as heterozygous are identified. Next, for each site and read group, we compute the proportion
* of bases in the pileup supporting an alternate allele. Finally, a one-sample, left-tailed
* t-test is performed with the null hypothesis being that the alternate allele distribution has
* a mean of 0.95 and the alternative hypothesis being that the true mean is statistically
* significantly less than expected.
*
* @author Kiran Garimella
*/
public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Integer> {
    @Argument(fullName="verbose", shortName="V", doc="Prints information for all loci, not just the suspected contaminating read groups", required=false)
    private Boolean VERBOSE = false;

    @Argument(fullName="balance", shortName="bal", doc="The expected alternate allele balance for homozygous-variant sites", required=false)
    private Double BALANCE = 0.95;

    @Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
    private Double LIMIT = 1e-9;

    private UnifiedArgumentCollection uac;
    private UnifiedGenotyper ug;

    // Rows: read group ids; columns: "contig.position" locus names;
    // cells: fraction of pileup bases supporting an alternate allele.
    private NamedTable altTable;

    /**
     * Set up the embedded UnifiedGenotyper (used by filter() to call genotypes)
     * and the table that accumulates per-read-group alternate allele balances.
     */
    public void initialize() {
        uac = new UnifiedArgumentCollection();
        uac.genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
        uac.CONFIDENCE_THRESHOLD = 50;

        ug = new UnifiedGenotyper();
        // NOTE(review): arguments are applied after initialize(); confirm the
        // UnifiedGenotyper honors an argument collection set post-initialization.
        ug.initialize();
        ug.setUnifiedArgumentCollection(uac);

        altTable = new NamedTable();
    }

    /**
     * Identify likely homozygous-variant sites that are called as
     * heterozygous, so that we can isolate our inspection to these sites.
     *
     * @param tracker the meta-data tracker
     * @param ref     information regarding the reference
     * @param context information regarding the reads
     * @return true if this site is a suspicious het, false if otherwise
     */
    public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        ReadBackedPileup pileup = new ReadBackedPileup(ref.getBase(), context);
        int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());

        int altCount = 0;
        int totalCount = 0;
        for (byte base : pileup.getBases().getBytes()) {
            // Any base whose index differs from the reference (including
            // ambiguous bases) is counted as supporting an alternate allele.
            if (BaseUtils.simpleBaseToBaseIndex((char) base) != refIndex) {
                altCount++;
            }
            totalCount++;
        }

        // Guard against an empty pileup: 0/0 would yield NaN below.
        if (totalCount == 0) {
            return false;
        }

        double altBalance = ((double) altCount) / ((double) totalCount);

        // A site skewed heavily toward the alternate allele that the genotyper
        // nonetheless calls het is exactly the signature we want to inspect.
        if (altBalance > 0.70) {
            Pair<List<Genotype>, GenotypeLocusData> ugResult = ug.map(tracker, ref, context);

            if (ugResult != null && ugResult.first != null && !ugResult.first.isEmpty()) {
                return ugResult.first.get(0).isHet();
            }
        }

        return false;
    }

    /**
     * For each read group represented in the pileup, record the fraction of
     * bases supporting the alternate allele at this locus into altTable.
     *
     * @param tracker the meta-data tracker
     * @param ref     information regarding the reference
     * @param context information regarding the reads
     * @return 1
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        NamedTable alleleCounts = new NamedTable();

        int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
        String colName = String.format("%s.%d", context.getContig(), context.getPosition());

        // Tally ref/alt support per read group at this locus.
        for (int i = 0; i < context.numReads(); i++) {
            SAMRecord read = context.getReads().get(i);
            int offset = context.getOffsets().get(i);
            SAMReadGroupRecord rg = read.getReadGroup();

            int alleleIndex = BaseUtils.simpleBaseToBaseIndex((char) read.getReadBases()[offset]);
            alleleCounts.increment(rg.getReadGroupId(), (alleleIndex == refIndex) ? "ref" : "alt");
        }

        // Convert counts to an alternate-allele fraction per read group.
        for (String rg : alleleCounts.getRowNames()) {
            double altCount = alleleCounts.get(rg, "alt");
            double refCount = alleleCounts.get(rg, "ref");

            altTable.set(rg, colName, altCount / (altCount + refCount));
        }

        return 1;
    }

    /**
     * Provide an initial value for reduce computations.
     *
     * @return Initial value of reduce (unused; all state lives in altTable).
     */
    public Integer reduceInit() {
        return null;
    }

    /**
     * Reduces a single map with the accumulator provided as the ReduceType.
     *
     * @param value result of the map (ignored).
     * @param sum   accumulator for the reduce (ignored).
     * @return null; all state is accumulated in altTable.
     */
    public Integer reduce(Integer value, Integer sum) {
        return null;
    }

    /**
     * Perform a one-sample, left-tailed t-test per read group and list the
     * read groups that are significant underperformers (pValue below LIMIT).
     *
     * @param result the number of suspicious sites we're inspecting (this argument is ignored)
     */
    public void onTraversalDone(Integer result) {
        if (VERBOSE) { out.println("#readgroup\tpvalue\tbalances"); }

        for (String rg : altTable.getRowNames()) {
            StringBuilder balances = new StringBuilder();

            // Mean of the per-locus alternate-allele balances for this read group.
            double sum = 0.0, total = 0.0;
            for (String locus : altTable.getColumnNames()) {
                double value = altTable.get(rg, locus);

                sum += value;
                total += 1.0;

                balances.append(String.format("%2.2f,", value));
            }

            // A t-test needs at least two observations: with total == 1 the
            // degrees of freedom are 0 and the SEM is 0, making t undefined.
            if (total < 2.0) {
                if (VERBOSE) { out.printf("#%s\tNA\t[%s]\n", rg, balances); }
                continue;
            }

            double mean = sum / total;

            // Sample standard deviation (Bessel's correction, n - 1) — the
            // one-sample t-test is defined in terms of the sample stdev;
            // dividing by n would bias the statistic.
            double squareSumOfMeanDifferences = 0.0;
            for (String locus : altTable.getColumnNames()) {
                double value = altTable.get(rg, locus);
                squareSumOfMeanDifferences += Math.pow(value - mean, 2.0);
            }
            double stdev = Math.sqrt(squareSumOfMeanDifferences / (total - 1.0));

            // Standard error of the mean and degrees of freedom.
            double sem = stdev / Math.sqrt(total);
            double dof = total - 1.0;

            // Left-tailed p-value: P(T_dof <= t). When every balance is
            // identical (sem == 0) the statistic degenerates; the limiting
            // p-value is 0 if the mean is below the expected balance, else 1.
            double pValue;
            if (sem == 0.0) {
                pValue = (mean < BALANCE) ? 0.0 : 1.0;
            } else {
                double t = (mean - BALANCE) / sem;
                pValue = Probability.studentT(dof, t);
            }

            if (pValue < LIMIT) {
                out.printf("%s\t%e\t[%s]\n", rg, pValue, balances);
            } else if (VERBOSE) {
                out.printf("#%s\t%e\t[%s]\n", rg, pValue, balances);
            }
        }
    }
}