Initial implementation of evaluator to compute popgen theta statistics
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4004 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0a8ebcb4f9
commit
1dc373b9d0
|
|
@ -0,0 +1,136 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.playground.utils.report.tags.Analysis;
|
||||
import org.broadinstitute.sting.playground.utils.report.tags.DataPoint;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
@Analysis(name = "Theta Variant Evaluator", description = "Computes different estimates of theta based on variant sites and genotypes")
|
||||
public class ThetaVariantEvaluator extends VariantEvaluator {
|
||||
|
||||
@DataPoint(name = "avg_heterozygosity", description = "Average heterozygosity at variant sites; note that missing genotypes are ignored when computing this value")
|
||||
double avgHet = 0.0;
|
||||
@DataPoint(name = "avg_pairwise_diffs", description = "Average pairwise differences at aligned sequences; averaged over both number of sequeneces and number of variant sites; note that missing genotypes are ignored when computing this value")
|
||||
double avgAvgDiffs = 0.0;
|
||||
@DataPoint(name = "sum_heterozygosity", description = "Sum of heterozygosity over all variant sites; divide this by total target to get estimate of per base theta")
|
||||
double totalHet = 0.0;
|
||||
@DataPoint(name = "sum_pairwise_diffs", description = "Sum of pairwise diffs over all variant sites; divide this by total target to get estimate of per base theta")
|
||||
double totalAvgDiffs = 0.0;
|
||||
@DataPoint(name = "theta_region_num_sites", description = "Theta for entire region estimated based on number of segregating sites; divide ths by total target to get estimate of per base theta")
|
||||
double thetaRegionNumSites = 0.0;
|
||||
|
||||
//helper variables
|
||||
double numSites = 0;
|
||||
|
||||
public ThetaVariantEvaluator(VariantEvalWalker parent) {
|
||||
super(parent);
|
||||
}
|
||||
|
||||
public boolean enabled() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "theta";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) {
|
||||
return null; //no interesting sites
|
||||
}
|
||||
|
||||
if (vc.hasGenotypes()) {
|
||||
|
||||
//this maps allele to a count
|
||||
ConcurrentMap<String, Integer> alleleCounts = new ConcurrentHashMap<String, Integer>();
|
||||
|
||||
int numHetsHere = 0;
|
||||
float numGenosHere = 0;
|
||||
int numIndsHere = 0;
|
||||
|
||||
for (Genotype genotype : vc.getGenotypes().values()) {
|
||||
numIndsHere++;
|
||||
if (!genotype.isNoCall()) {
|
||||
//increment stats for heterozygosity
|
||||
if (genotype.isHet()) {
|
||||
numHetsHere++;
|
||||
}
|
||||
|
||||
numGenosHere++;
|
||||
//increment stats for pairwise mismatches
|
||||
|
||||
for (Allele allele : genotype.getAlleles()) {
|
||||
if (allele.isNonNull() && allele.isCalled()) {
|
||||
String alleleString = allele.toString();
|
||||
alleleCounts.putIfAbsent(alleleString, 0);
|
||||
alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (numGenosHere > 0) {
|
||||
//only if have one called genotype at least
|
||||
this.numSites++;
|
||||
|
||||
this.totalHet += numHetsHere / numGenosHere;
|
||||
|
||||
//compute based on num sites
|
||||
float harmonicFactor = 0;
|
||||
for (int i = 1; i <= numIndsHere; i++) {
|
||||
harmonicFactor += 1.0 / i;
|
||||
}
|
||||
this.thetaRegionNumSites += 1.0 / harmonicFactor;
|
||||
|
||||
//now compute pairwise mismatches
|
||||
float numPairwise = 0;
|
||||
float numDiffs = 0;
|
||||
for (String allele1 : alleleCounts.keySet()) {
|
||||
int allele1Count = alleleCounts.get(allele1);
|
||||
|
||||
for (String allele2 : alleleCounts.keySet()) {
|
||||
if (allele1.compareTo(allele2) < 0) {
|
||||
continue;
|
||||
}
|
||||
if (allele1 .compareTo(allele2) == 0) {
|
||||
numPairwise += allele1Count * (allele1Count - 1) * .5;
|
||||
|
||||
}
|
||||
else {
|
||||
int allele2Count = alleleCounts.get(allele2);
|
||||
numPairwise += allele1Count * allele2Count;
|
||||
numDiffs += allele1Count * allele2Count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numPairwise > 0) {
|
||||
this.totalAvgDiffs += numDiffs / numPairwise;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finalizeEvaluation() {
|
||||
|
||||
if (this.numSites > 0) {
|
||||
|
||||
this.avgHet = this.totalHet / this.numSites;
|
||||
this.avgAvgDiffs = this.totalAvgDiffs / this.numSites;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue