Restructured AlleleFrequencyMetricsWalker to correctly report Hapmap concordance numbers for genotyping and added reporting for Hapmap reference/variant calling. Also, tiny bugfix in interval code.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@181 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-03-25 01:12:05 +00:00
parent 9e2a373184
commit 30babbf5b9
4 changed files with 185 additions and 98 deletions

View File

@ -597,7 +597,7 @@ public class TraversalEngine {
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
// location
if ( ( interval == null && inLocations(locus.getLocation()) ) || interval.overlapsP(locus.getLocation()) ) {
if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) {
//System.out.format("Working at %s\n", locus.getLocation().toString());

View File

@ -26,8 +26,10 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
long num_loci_total=0;
long num_loci_confident=0;
double LOD_cutoff = 5;
long hapmap_tp=0;
long hapmap_fp=0;
long hapmap_genotype_correct = 0;
long hapmap_genotype_incorrect = 0;
long hapmap_refvar_correct = 0;
long hapmap_refvar_incorrect = 0;
AlleleFrequencyWalker caller;
@ -37,70 +39,139 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
num_loci_total += 1;
boolean is_dbSNP_SNP = false;
boolean has_hapmap_chip_genotype = false;
rodGFF hapmap_chip_genotype = null;
for ( ReferenceOrderedDatum datum : rodData )
{
if ( datum != null )
{
if ( datum instanceof rodDbSNP )
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
if ( datum instanceof rodGFF )
{
has_hapmap_chip_genotype = true;
hapmap_chip_genotype = (rodGFF)datum;
}
}
}
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff)
if (alleleFreq.qstar > 0.0 && alleleFreq.LOD >= LOD_cutoff)
{
// Confident variant.
num_variants += 1;
boolean is_dbSNP_SNP = false;
boolean hapmap_hit = false;
for ( ReferenceOrderedDatum datum : rodData )
{
if ( datum != null )
{
if ( datum instanceof rodDbSNP )
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
if ( datum instanceof rodGFF )
{
rodGFF hapmap = (rodGFF) datum;
String hapmap_genotype = hapmap.getFeature();
String called_genotype = alleleFreq.asString();
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD());
// There is a hapmap site here, is it correct?
if (hapmap_genotype.equals(called_genotype))
{
hapmap_tp++;
} else {
hapmap_fp++;
}
}
}
}
if (is_dbSNP_SNP)
{
dbsnp_hits += 1;
}
}
// Convert qstar and LOD to give best qstar and a positive LOD for that qstar
double qstar;
double LOD;
if (alleleFreq.LOD < 0) {
qstar = 0.0;
LOD = -1 * alleleFreq.LOD;
}else{
qstar = alleleFreq.qstar;
LOD = alleleFreq.LOD;
}
//long confident_calls_at_hapmap_chip_sites = 0;
if (LOD >= LOD_cutoff && has_hapmap_chip_genotype) {
//confident_calls_at_hapmap_chip_sites++;
// convert hapmap call to mixture of ref/nonref
String hapmap_genotype = hapmap_chip_genotype.getFeature();
long refs=0, alts=0;
double hapmap_q;
for (char c : hapmap_genotype.toCharArray()) {
if (c == alleleFreq.ref) { refs++; }
if (c == alleleFreq.alt) { alts++; }
}
if (refs+alts > 0) {
hapmap_q = (float)alts / (refs+alts);
}else{
hapmap_q = -1;
}
// Hapmap debug info
System.out.format("HAPMAP %.2f %.2f %.2f ", hapmap_q, qstar, LOD);
String called_genotype = alleleFreq.asString();
System.out.format("%s %s %c %c %.2f", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt, alleleFreq.LOD);
// Calculate genotyping performance - did we get the correct genotype of the N+1 choices?
if (hapmap_q != -1 && hapmap_q == qstar) {
hapmap_genotype_correct++;
System.out.print("\n");
}else{
hapmap_genotype_incorrect++;
System.out.printf(" INCORRECT GENOTYPE Bases: %s\n", AlleleFrequencyWalker.getBases(context));
AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context));
}
// Now calculate ref / var performance - did we correctly classify the site as
// reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here
boolean hapmap_var = hapmap_q != 0.0;
boolean called_var = qstar != 0.0;
if (hapmap_var != called_var) {
hapmap_refvar_incorrect++;
}else{
hapmap_refvar_correct++;
}
}
/*String hapmap_genotype = hapmap_chip_genotype.getFeature();
String called_genotype = alleleFreq.asString();
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.LOD);
// There is a hapmap site here, is it correct?
if (hapmap_genotype.equals(called_genotype))
{
hapmap_tp++;
} else {
hapmap_fp++;
} */
return alleleFreq;
}
public void printMetrics()
public void printMetrics()
{
if (num_loci_total == 0) { return; }
System.out.printf("\n");
System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
if (num_variants != 0)
{
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp);
System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp);
System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp);
System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp));
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS -- Hapmap Genotyping performance --\n");
System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS -- Hapmap Reference/Variant performance --\n");
System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
}
System.out.println();
}
@ -118,17 +189,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5))
{
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
// Print RESULT data for confident calls
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
if (this.num_loci_total % 10000 == 0) { printMetrics(); }

View File

@ -12,56 +12,35 @@ import java.util.Arrays;
public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstimate, Integer> {
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
// Convert context data into bases and 4-base quals
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
// Set number of chromosomes, N, to 2 for now
int N = 2;
// Convert bases to CharArray
int numReads = context.getReads().size(); //numReads();
byte[] bases = new byte[numReads];
String base_string = "";
double[][] quals = new double[numReads][4];
int refnum = nuc2num[ref];
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < numReads; i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
bases[i] = read.getReadBases()[offset];
base_string += read.getReadString().charAt(offset);
int callednum = nuc2num[bases[i]];
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
for (int b=0; b<4; b++)
if (b != callednum)
quals[i][b] = nonref_quals;
}
// Convert context data into bases and 4-base quals
String bases = getBases(context);
double quals[][] = getOneBaseQuals(context);
// Count bases
int[] base_counts = new int[4];
for (byte b : bases)
for (byte b : bases.getBytes())
base_counts[nuc2num[b]]++;
// Find alternate allele - 2nd most frequent non-ref allele
// (maybe we should check for ties and eval both or check most common including quality scores)
int altnum = -1; // start with first base numerical identity (0,1,2,3)
double altcount = -1; // start with first base count
int refnum = nuc2num[ref];
for (int b=0; b<4; b++) {
if ((b != refnum) && (base_counts[b] > altcount)) {
altnum = b; altcount = base_counts[b];
altnum = b; altcount = base_counts[b];
}
}
assert(altnum != -1);
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases, quals, refnum, altnum, base_string.length());
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases.getBytes(), quals, refnum, altnum, bases.length());
// Print dbSNP data if its there
if (false) {
@ -76,6 +55,47 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
return alleleFreq;
}
static public String getBases (LocusContext context) {
// Convert bases to CharArray
int numReads = context.getReads().size(); //numReads();
//byte[] bases = new byte[numReads];
String base_string = "";
//int refnum = nuc2num[ref];
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < numReads; i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
//bases[i] = read.getReadBases()[offset];
base_string += read.getReadString().charAt(offset);
}
return base_string;
}
static public double[][] getOneBaseQuals (LocusContext context) {
int numReads = context.getReads().size(); //numReads();
double[][] quals = new double[numReads][4];
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < numReads; i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
int callednum = nuc2num[read.getReadBases()[offset]];
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
for (int b=0; b<4; b++)
if (b != callednum)
quals[i][b] = nonref_quals;
}
return quals;
}
public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
{
@ -114,7 +134,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
double pG = P_G(N, qstar_N);
double posterior = pDq + pqG + pG;
if (posterior > best_posterior)
if (posterior > best_posterior)
{
best_qstar = qstar;
best_qhat = q;
@ -124,6 +144,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
best_pqG = pqG;
best_pG = pG;
}
//System.out.format("%.2f %.2f %5.2f %5.2f %5.2f %5.2f\n", q, qstar, pDq, pqG, pG, posterior);
}
}
@ -230,17 +251,9 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum)
{
if (alleleFreq.LOD >= 5)
{
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
// Print RESULT data for confident calls
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
return 0;
}
@ -267,9 +280,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
void print_base_qual_matrix(double [][]quals, int numReads) {
static void print_base_qual_matrix(double [][]quals) {
// Print quals for debugging
System.out.println("Base quality matrix");
int numReads = quals.length;
for (int b=0; b<4; b++) {
System.out.print(num2nuc[b]);
for (int i=0; i < numReads; i++){

View File

@ -13,7 +13,7 @@ public class AlleleFrequencyEstimate {
public double LOD;
public int depth;
public double getQstar()
public double getQstar()
{
return qstar;
}
@ -35,6 +35,17 @@ public class AlleleFrequencyEstimate {
this.depth = depth;
}
public String asTabularString() {
return String.format("RESULT %s %c %c %f %f %f %d\n",
location,
ref,
alt,
qhat,
qstar,
LOD,
depth);
}
public String asString() {
// Print out the called bases
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref