FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear to be contaminants by searching for evidence of systematic underperformance at likely homozygous-variant sites.
Procedure: 1. Sites that are likely homozygous-variant but are called as heterozygous are identified. 2. For each site and read group, we compute the proportion of bases in the pileup supporting an alternate allele. 3. A one-sample, left-tailed t-test is performed with the null hypothesis being that the alternate allele distribution has a mean of 0.95 and the alternative hypothesis being that the true mean is lower than that; a read group is reported as a likely contaminant when the null hypothesis is rejected (pValue < 1e-9). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1989 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2225d8176e
commit
a679bdde18
|
|
@ -0,0 +1,209 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.contamination;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.GenotypeLocusData;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.playground.utils.NamedTable;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import cern.jet.stat.Probability;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
|
||||||
|
* to be contaminants by searching for evidence of systematic underperformance at likely
|
||||||
|
* homozygous-variant sites. First, sites that are likely homozygous-variant but are called
|
||||||
|
* as heterozygous are identified. Next, per each site and read group, we compute the proportion
|
||||||
|
* of bases in the pileup supporting an alternate allele. Finally, a one-sample, left-tailed
|
||||||
|
* t-test is performed with the null hypothesis being that the alternate allele distribution has
|
||||||
|
* a mean of 0.95 and the alternate hypothesis being that the true mean is statistically
|
||||||
|
* significantly less than expected.
|
||||||
|
*
|
||||||
|
* @author Kiran Garimella
|
||||||
|
*/
|
||||||
|
public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
@Argument(fullName="verbose", shortName="V", doc="Prints information for all loci, not just the suspected contaminating read groups", required=false)
|
||||||
|
private Boolean VERBOSE = false;
|
||||||
|
|
||||||
|
@Argument(fullName="balance", shortName="bal", doc="The expected alternate allele balance for homozygous-variant sites", required=false)
|
||||||
|
private Double BALANCE = 0.95;
|
||||||
|
|
||||||
|
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
|
||||||
|
private Double LIMIT = 1e-9;
|
||||||
|
|
||||||
|
private UnifiedArgumentCollection uac;
|
||||||
|
private UnifiedGenotyper ug;
|
||||||
|
private NamedTable altTable;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
uac = new UnifiedArgumentCollection();
|
||||||
|
uac.genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
||||||
|
uac.CONFIDENCE_THRESHOLD = 50;
|
||||||
|
|
||||||
|
ug = new UnifiedGenotyper();
|
||||||
|
ug.initialize();
|
||||||
|
ug.setUnifiedArgumentCollection(uac);
|
||||||
|
|
||||||
|
altTable = new NamedTable();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Identify likely homozygous-variant sites that are called as
|
||||||
|
* heterozygous, so that we can isolate our inspection to these sites.
|
||||||
|
*
|
||||||
|
* @param tracker the meta-data tracker
|
||||||
|
* @param ref information regarding the reference
|
||||||
|
* @param context information regarding the reads
|
||||||
|
* @return true if this site is a suspicious het, false if otherwise
|
||||||
|
*/
|
||||||
|
public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
int altCount = 0;
|
||||||
|
int totalCount = 0;
|
||||||
|
|
||||||
|
ReadBackedPileup pileup = new ReadBackedPileup(ref.getBase(), context);
|
||||||
|
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
||||||
|
|
||||||
|
for (byte base : pileup.getBases().getBytes()) {
|
||||||
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex((char) base);
|
||||||
|
|
||||||
|
if (baseIndex != refIndex) {
|
||||||
|
altCount++;
|
||||||
|
}
|
||||||
|
totalCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
double altBalance = ((double) altCount)/((double) totalCount);
|
||||||
|
|
||||||
|
if (altBalance > 0.70) {
|
||||||
|
Pair<List<Genotype>, GenotypeLocusData> ugResult = ug.map(tracker, ref, context);
|
||||||
|
|
||||||
|
if (ugResult != null && ugResult.first != null) {
|
||||||
|
return ugResult.first.get(0).isHet();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For each read group represented in the pileup, determine the fraction of bases supporting the alternate allele
|
||||||
|
*
|
||||||
|
* @param tracker the meta-data tracker
|
||||||
|
* @param ref information regarding the reference
|
||||||
|
* @param context information regarding the reads
|
||||||
|
* @return 1
|
||||||
|
*/
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
NamedTable alleleCounts = new NamedTable();
|
||||||
|
|
||||||
|
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
||||||
|
String colName = String.format("%s.%d", context.getContig(), context.getPosition());
|
||||||
|
|
||||||
|
for (int i = 0; i < context.numReads(); i++) {
|
||||||
|
SAMRecord read = context.getReads().get(i);
|
||||||
|
int offset = context.getOffsets().get(i);
|
||||||
|
|
||||||
|
SAMReadGroupRecord rg = read.getReadGroup();
|
||||||
|
int alleleIndex = BaseUtils.simpleBaseToBaseIndex((char) read.getReadBases()[offset]);
|
||||||
|
|
||||||
|
alleleCounts.increment(rg.getReadGroupId(), (alleleIndex == refIndex) ? "ref" : "alt");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String rg : alleleCounts.getRowNames()) {
|
||||||
|
double altCount = alleleCounts.get(rg, "alt");
|
||||||
|
double refCount = alleleCounts.get(rg, "ref");
|
||||||
|
|
||||||
|
altTable.set(rg, colName, altCount / (altCount + refCount));
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provide an initial value for reduce computations.
|
||||||
|
*
|
||||||
|
* @return Initial value of reduce.
|
||||||
|
*/
|
||||||
|
public Integer reduceInit() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||||
|
*
|
||||||
|
* @param value result of the map.
|
||||||
|
* @param sum accumulator for the reduce.
|
||||||
|
* @return accumulator with result of the map taken into account.
|
||||||
|
*/
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform the t-test and list the read groups that are significant underperformers.
|
||||||
|
*
|
||||||
|
* @param result the number of suspicious sites we're inspecting (this argument is ignored)
|
||||||
|
*/
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
if (VERBOSE) { out.println("#readgroup\tpvalue\tbalances"); }
|
||||||
|
|
||||||
|
for (String rg : altTable.getRowNames()) {
|
||||||
|
String balances = "";
|
||||||
|
|
||||||
|
// Compute mean
|
||||||
|
double sum = 0.0, total = 0.0;
|
||||||
|
|
||||||
|
for (String locus : altTable.getColumnNames()) {
|
||||||
|
double value = altTable.get(rg, locus);
|
||||||
|
|
||||||
|
sum += value;
|
||||||
|
total += 1.0;
|
||||||
|
|
||||||
|
balances += String.format("%2.2f,", value);
|
||||||
|
}
|
||||||
|
|
||||||
|
double mean = sum/total;
|
||||||
|
|
||||||
|
// Compute stdev
|
||||||
|
double squareSumOfMeanDifferences = 0.0;
|
||||||
|
|
||||||
|
for (String locus : altTable.getColumnNames()) {
|
||||||
|
double value = altTable.get(rg, locus);
|
||||||
|
|
||||||
|
squareSumOfMeanDifferences += Math.pow(value - mean, 2.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
double stdev = Math.sqrt(squareSumOfMeanDifferences/total);
|
||||||
|
|
||||||
|
// Compute standard error of the mean (SEM)
|
||||||
|
double sem = stdev/Math.sqrt(total);
|
||||||
|
|
||||||
|
// Compute test statistic t
|
||||||
|
double t = (mean - BALANCE) / sem;
|
||||||
|
|
||||||
|
// Degrees of freedom
|
||||||
|
double dof = total - 1.0;
|
||||||
|
|
||||||
|
// Compute pValue
|
||||||
|
double pValue = Probability.studentT(dof, t);
|
||||||
|
|
||||||
|
if (pValue < LIMIT) {
|
||||||
|
out.printf("%s\t%e\t[%s]\n", rg, pValue, balances);
|
||||||
|
} else {
|
||||||
|
if (VERBOSE) { out.printf("#%s\t%e\t[%s]\n", rg, pValue, balances); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue