Initial implementation of evaluator to compute popgen theta statistics
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4004 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0a8ebcb4f9
commit
1dc373b9d0
|
|
@ -0,0 +1,136 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.playground.utils.report.tags.Analysis;
|
||||||
|
import org.broadinstitute.sting.playground.utils.report.tags.DataPoint;
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
@Analysis(name = "Theta Variant Evaluator", description = "Computes different estimates of theta based on variant sites and genotypes")
|
||||||
|
public class ThetaVariantEvaluator extends VariantEvaluator {
|
||||||
|
|
||||||
|
@DataPoint(name = "avg_heterozygosity", description = "Average heterozygosity at variant sites; note that missing genotypes are ignored when computing this value")
|
||||||
|
double avgHet = 0.0;
|
||||||
|
@DataPoint(name = "avg_pairwise_diffs", description = "Average pairwise differences at aligned sequences; averaged over both number of sequeneces and number of variant sites; note that missing genotypes are ignored when computing this value")
|
||||||
|
double avgAvgDiffs = 0.0;
|
||||||
|
@DataPoint(name = "sum_heterozygosity", description = "Sum of heterozygosity over all variant sites; divide this by total target to get estimate of per base theta")
|
||||||
|
double totalHet = 0.0;
|
||||||
|
@DataPoint(name = "sum_pairwise_diffs", description = "Sum of pairwise diffs over all variant sites; divide this by total target to get estimate of per base theta")
|
||||||
|
double totalAvgDiffs = 0.0;
|
||||||
|
@DataPoint(name = "theta_region_num_sites", description = "Theta for entire region estimated based on number of segregating sites; divide ths by total target to get estimate of per base theta")
|
||||||
|
double thetaRegionNumSites = 0.0;
|
||||||
|
|
||||||
|
//helper variables
|
||||||
|
double numSites = 0;
|
||||||
|
|
||||||
|
public ThetaVariantEvaluator(VariantEvalWalker parent) {
|
||||||
|
super(parent);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean enabled() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return "theta";
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getComparisonOrder() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
|
if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) {
|
||||||
|
return null; //no interesting sites
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vc.hasGenotypes()) {
|
||||||
|
|
||||||
|
//this maps allele to a count
|
||||||
|
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
|
||||||
|
|
||||||
|
int numHetsHere = 0;
|
||||||
|
float numGenosHere = 0;
|
||||||
|
int numIndsHere = 0;
|
||||||
|
|
||||||
|
for (Genotype genotype : vc.getGenotypes().values()) {
|
||||||
|
numIndsHere++;
|
||||||
|
if (!genotype.isNoCall()) {
|
||||||
|
//increment stats for heterozygosity
|
||||||
|
if (genotype.isHet()) {
|
||||||
|
numHetsHere++;
|
||||||
|
}
|
||||||
|
|
||||||
|
numGenosHere++;
|
||||||
|
//increment stats for pairwise mismatches
|
||||||
|
|
||||||
|
for (Allele allele : genotype.getAlleles()) {
|
||||||
|
if (allele.isNonNull() && allele.isCalled()) {
|
||||||
|
String alleleString = allele.toString();
|
||||||
|
alleleCounts.putIfAbsent(alleleString, 0);
|
||||||
|
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (numGenosHere > 0) {
|
||||||
|
//only if have one called genotype at least
|
||||||
|
this.numSites++;
|
||||||
|
|
||||||
|
this.totalHet += numHetsHere / numGenosHere;
|
||||||
|
|
||||||
|
//compute based on num sites
|
||||||
|
float harmonicFactor = 0;
|
||||||
|
for (int i = 1; i <= numIndsHere; i++) {
|
||||||
|
harmonicFactor += 1.0 / i;
|
||||||
|
}
|
||||||
|
this.thetaRegionNumSites += 1.0 / harmonicFactor;
|
||||||
|
|
||||||
|
//now compute pairwise mismatches
|
||||||
|
float numPairwise = 0;
|
||||||
|
float numDiffs = 0;
|
||||||
|
for (String allele1 : alleleCounts.keySet()) {
|
||||||
|
int allele1Count = alleleCounts.get(allele1);
|
||||||
|
|
||||||
|
for (String allele2 : alleleCounts.keySet()) {
|
||||||
|
if (allele1.compareTo(allele2) < 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (allele1 .compareTo(allele2) == 0) {
|
||||||
|
numPairwise += allele1Count * (allele1Count - 1) * .5;
|
||||||
|
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int allele2Count = alleleCounts.get(allele2);
|
||||||
|
numPairwise += allele1Count * allele2Count;
|
||||||
|
numDiffs += allele1Count * allele2Count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numPairwise > 0) {
|
||||||
|
this.totalAvgDiffs += numDiffs / numPairwise;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void finalizeEvaluation() {
|
||||||
|
|
||||||
|
if (this.numSites > 0) {
|
||||||
|
|
||||||
|
this.avgHet = this.totalHet / this.numSites;
|
||||||
|
this.avgAvgDiffs = this.totalAvgDiffs / this.numSites;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue