FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear to be contaminants by searching for evidence of systematic underperformance at likely homozygous-variant sites.
Procedure: 1. Sites that are likely homozygous-variant but are called as heterozygous are identified. 2. For each site and read group, we compute the proportion of bases in the pileup supporting an alternate allele. 3. A one-sample, left-tailed t-test is performed with the null hypothesis being that the alternate allele distribution has a mean of 0.95 and the alternate hypothesis being that the true mean is statistically significantly less than expected (pValue < 1e-9). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1989 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2225d8176e
commit
a679bdde18
|
|
@ -0,0 +1,209 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.contamination;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeLocusData;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.playground.utils.NamedTable;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import cern.jet.stat.Probability;
|
||||
|
||||
/**
|
||||
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
|
||||
* to be contaminants by searching for evidence of systematic underperformance at likely
|
||||
* homozygous-variant sites. First, sites that are likely homozygous-variant but are called
|
||||
* as heterozygous are identified. Next, per each site and read group, we compute the proportion
|
||||
* of bases in the pileup supporting an alternate allele. Finally, a one-sample, left-tailed
|
||||
* t-test is performed with the null hypothesis being that the alternate allele distribution has
|
||||
* a mean of 0.95 and the alternate hypothesis being that the true mean is statistically
|
||||
* significantly less than expected.
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Integer> {
|
||||
@Argument(fullName="verbose", shortName="V", doc="Prints information for all loci, not just the suspected contaminating read groups", required=false)
|
||||
private Boolean VERBOSE = false;
|
||||
|
||||
@Argument(fullName="balance", shortName="bal", doc="The expected alternate allele balance for homozygous-variant sites", required=false)
|
||||
private Double BALANCE = 0.95;
|
||||
|
||||
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
|
||||
private Double LIMIT = 1e-9;
|
||||
|
||||
private UnifiedArgumentCollection uac;
|
||||
private UnifiedGenotyper ug;
|
||||
private NamedTable altTable;
|
||||
|
||||
public void initialize() {
|
||||
uac = new UnifiedArgumentCollection();
|
||||
uac.genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
||||
uac.CONFIDENCE_THRESHOLD = 50;
|
||||
|
||||
ug = new UnifiedGenotyper();
|
||||
ug.initialize();
|
||||
ug.setUnifiedArgumentCollection(uac);
|
||||
|
||||
altTable = new NamedTable();
|
||||
}
|
||||
|
||||
/**
|
||||
* Identify likely homozygous-variant sites that are called as
|
||||
* heterozygous, so that we can isolate our inspection to these sites.
|
||||
*
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref information regarding the reference
|
||||
* @param context information regarding the reads
|
||||
* @return true if this site is a suspicious het, false if otherwise
|
||||
*/
|
||||
public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
int altCount = 0;
|
||||
int totalCount = 0;
|
||||
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref.getBase(), context);
|
||||
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
||||
|
||||
for (byte base : pileup.getBases().getBytes()) {
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex((char) base);
|
||||
|
||||
if (baseIndex != refIndex) {
|
||||
altCount++;
|
||||
}
|
||||
totalCount++;
|
||||
}
|
||||
|
||||
double altBalance = ((double) altCount)/((double) totalCount);
|
||||
|
||||
if (altBalance > 0.70) {
|
||||
Pair<List<Genotype>, GenotypeLocusData> ugResult = ug.map(tracker, ref, context);
|
||||
|
||||
if (ugResult != null && ugResult.first != null) {
|
||||
return ugResult.first.get(0).isHet();
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* For each read group represented in the pileup, determine the fraction of bases supporting the alternate allele
|
||||
*
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref information regarding the reference
|
||||
* @param context information regarding the reads
|
||||
* @return 1
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
NamedTable alleleCounts = new NamedTable();
|
||||
|
||||
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
||||
String colName = String.format("%s.%d", context.getContig(), context.getPosition());
|
||||
|
||||
for (int i = 0; i < context.numReads(); i++) {
|
||||
SAMRecord read = context.getReads().get(i);
|
||||
int offset = context.getOffsets().get(i);
|
||||
|
||||
SAMReadGroupRecord rg = read.getReadGroup();
|
||||
int alleleIndex = BaseUtils.simpleBaseToBaseIndex((char) read.getReadBases()[offset]);
|
||||
|
||||
alleleCounts.increment(rg.getReadGroupId(), (alleleIndex == refIndex) ? "ref" : "alt");
|
||||
}
|
||||
|
||||
for (String rg : alleleCounts.getRowNames()) {
|
||||
double altCount = alleleCounts.get(rg, "alt");
|
||||
double refCount = alleleCounts.get(rg, "ref");
|
||||
|
||||
altTable.set(rg, colName, altCount / (altCount + refCount));
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform the t-test and list the read groups that are significant underperformers.
|
||||
*
|
||||
* @param result the number of suspicious sites we're inspecting (this argument is ignored)
|
||||
*/
|
||||
public void onTraversalDone(Integer result) {
|
||||
if (VERBOSE) { out.println("#readgroup\tpvalue\tbalances"); }
|
||||
|
||||
for (String rg : altTable.getRowNames()) {
|
||||
String balances = "";
|
||||
|
||||
// Compute mean
|
||||
double sum = 0.0, total = 0.0;
|
||||
|
||||
for (String locus : altTable.getColumnNames()) {
|
||||
double value = altTable.get(rg, locus);
|
||||
|
||||
sum += value;
|
||||
total += 1.0;
|
||||
|
||||
balances += String.format("%2.2f,", value);
|
||||
}
|
||||
|
||||
double mean = sum/total;
|
||||
|
||||
// Compute stdev
|
||||
double squareSumOfMeanDifferences = 0.0;
|
||||
|
||||
for (String locus : altTable.getColumnNames()) {
|
||||
double value = altTable.get(rg, locus);
|
||||
|
||||
squareSumOfMeanDifferences += Math.pow(value - mean, 2.0);
|
||||
}
|
||||
|
||||
double stdev = Math.sqrt(squareSumOfMeanDifferences/total);
|
||||
|
||||
// Compute standard error of the mean (SEM)
|
||||
double sem = stdev/Math.sqrt(total);
|
||||
|
||||
// Compute test statistic t
|
||||
double t = (mean - BALANCE) / sem;
|
||||
|
||||
// Degrees of freedom
|
||||
double dof = total - 1.0;
|
||||
|
||||
// Compute pValue
|
||||
double pValue = Probability.studentT(dof, t);
|
||||
|
||||
if (pValue < LIMIT) {
|
||||
out.printf("%s\t%e\t[%s]\n", rg, pValue, balances);
|
||||
} else {
|
||||
if (VERBOSE) { out.printf("#%s\t%e\t[%s]\n", rg, pValue, balances); }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue