Restructured AlleleFrequencyMetricsWalker to correctly report Hapmap concordance numbers for genotyping and added reporting for Hapmap reference/variant calling. Also, tiny bugfix in interval code.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@181 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-03-25 01:12:05 +00:00
parent 9e2a373184
commit 30babbf5b9
4 changed files with 185 additions and 98 deletions

View File

@ -597,7 +597,7 @@ public class TraversalEngine {
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this // if we don't have a particular interval we're processing, check them all, otherwise only operate at this
// location // location
if ( ( interval == null && inLocations(locus.getLocation()) ) || interval.overlapsP(locus.getLocation()) ) { if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) {
//System.out.format("Working at %s\n", locus.getLocation().toString()); //System.out.format("Working at %s\n", locus.getLocation().toString());

View File

@ -26,8 +26,10 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
long num_loci_total=0; long num_loci_total=0;
long num_loci_confident=0; long num_loci_confident=0;
double LOD_cutoff = 5; double LOD_cutoff = 5;
long hapmap_tp=0; long hapmap_genotype_correct = 0;
long hapmap_fp=0; long hapmap_genotype_incorrect = 0;
long hapmap_refvar_correct = 0;
long hapmap_refvar_incorrect = 0;
AlleleFrequencyWalker caller; AlleleFrequencyWalker caller;
@ -37,70 +39,139 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
num_loci_total += 1; num_loci_total += 1;
boolean is_dbSNP_SNP = false;
boolean has_hapmap_chip_genotype = false;
rodGFF hapmap_chip_genotype = null;
for ( ReferenceOrderedDatum datum : rodData )
{
if ( datum != null )
{
if ( datum instanceof rodDbSNP )
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
if ( datum instanceof rodGFF )
{
has_hapmap_chip_genotype = true;
hapmap_chip_genotype = (rodGFF)datum;
}
}
}
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; } if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff) if (alleleFreq.qstar > 0.0 && alleleFreq.LOD >= LOD_cutoff)
{ {
// Confident variant. // Confident variant.
num_variants += 1; num_variants += 1;
boolean is_dbSNP_SNP = false;
boolean hapmap_hit = false;
for ( ReferenceOrderedDatum datum : rodData )
{
if ( datum != null )
{
if ( datum instanceof rodDbSNP )
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
if ( datum instanceof rodGFF )
{
rodGFF hapmap = (rodGFF) datum;
String hapmap_genotype = hapmap.getFeature();
String called_genotype = alleleFreq.asString();
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD());
// There is a hapmap site here, is it correct?
if (hapmap_genotype.equals(called_genotype))
{
hapmap_tp++;
} else {
hapmap_fp++;
}
}
}
}
if (is_dbSNP_SNP) if (is_dbSNP_SNP)
{ {
dbsnp_hits += 1; dbsnp_hits += 1;
} }
} }
// Convert qstar and LOD to give best qstar and a positive LOD for that qstar
double qstar;
double LOD;
if (alleleFreq.LOD < 0) {
qstar = 0.0;
LOD = -1 * alleleFreq.LOD;
}else{
qstar = alleleFreq.qstar;
LOD = alleleFreq.LOD;
}
//long confident_calls_at_hapmap_chip_sites = 0;
if (LOD >= LOD_cutoff && has_hapmap_chip_genotype) {
//confident_calls_at_hapmap_chip_sites++;
// convert hapmap call to mixture of ref/nonref
String hapmap_genotype = hapmap_chip_genotype.getFeature();
long refs=0, alts=0;
double hapmap_q;
for (char c : hapmap_genotype.toCharArray()) {
if (c == alleleFreq.ref) { refs++; }
if (c == alleleFreq.alt) { alts++; }
}
if (refs+alts > 0) {
hapmap_q = (float)alts / (refs+alts);
}else{
hapmap_q = -1;
}
// Hapmap debug info
System.out.format("HAPMAP %.2f %.2f %.2f ", hapmap_q, qstar, LOD);
String called_genotype = alleleFreq.asString();
System.out.format("%s %s %c %c %.2f", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt, alleleFreq.LOD);
// Calculate genotyping performance - did we get the correct genotype of the N+1 choices?
if (hapmap_q != -1 && hapmap_q == qstar) {
hapmap_genotype_correct++;
System.out.print("\n");
}else{
hapmap_genotype_incorrect++;
System.out.printf(" INCORRECT GENOTYPE Bases: %s\n", AlleleFrequencyWalker.getBases(context));
AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context));
}
// Now calculate ref / var performance - did we correctly classify the site as
// reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here
boolean hapmap_var = hapmap_q != 0.0;
boolean called_var = qstar != 0.0;
if (hapmap_var != called_var) {
hapmap_refvar_incorrect++;
}else{
hapmap_refvar_correct++;
}
}
/*String hapmap_genotype = hapmap_chip_genotype.getFeature();
String called_genotype = alleleFreq.asString();
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.LOD);
// There is a hapmap site here, is it correct?
if (hapmap_genotype.equals(called_genotype))
{
hapmap_tp++;
} else {
hapmap_fp++;
} */
return alleleFreq; return alleleFreq;
} }
public void printMetrics() public void printMetrics()
{ {
if (num_loci_total == 0) { return; } if (num_loci_total == 0) { return; }
System.out.printf("\n"); System.out.printf("\n");
System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff); System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
System.out.printf("METRICS -------------------------------------------------\n"); System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS Total loci : %d\n", num_loci_total); System.out.printf("METRICS Total loci : %d\n", num_loci_total);
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
if (num_variants != 0) if (num_variants != 0)
{ {
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants); System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp); System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp); System.out.printf("METRICS -- Hapmap Genotyping performance --\n");
System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp); System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp)); System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS -- Hapmap Reference/Variant performance --\n");
System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
} }
System.out.println(); System.out.println();
} }
@ -118,17 +189,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{ {
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) // Print RESULT data for confident calls
{ if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
if (this.num_loci_total % 10000 == 0) { printMetrics(); } if (this.num_loci_total % 10000 == 0) { printMetrics(); }

View File

@ -12,56 +12,35 @@ import java.util.Arrays;
public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstimate, Integer> { public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstimate, Integer> {
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
// Convert context data into bases and 4-base quals public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
// Set number of chromosomes, N, to 2 for now // Set number of chromosomes, N, to 2 for now
int N = 2; int N = 2;
// Convert bases to CharArray // Convert context data into bases and 4-base quals
int numReads = context.getReads().size(); //numReads(); String bases = getBases(context);
byte[] bases = new byte[numReads]; double quals[][] = getOneBaseQuals(context);
String base_string = "";
double[][] quals = new double[numReads][4];
int refnum = nuc2num[ref];
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < numReads; i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
bases[i] = read.getReadBases()[offset];
base_string += read.getReadString().charAt(offset);
int callednum = nuc2num[bases[i]];
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
for (int b=0; b<4; b++)
if (b != callednum)
quals[i][b] = nonref_quals;
}
// Count bases // Count bases
int[] base_counts = new int[4]; int[] base_counts = new int[4];
for (byte b : bases) for (byte b : bases.getBytes())
base_counts[nuc2num[b]]++; base_counts[nuc2num[b]]++;
// Find alternate allele - 2nd most frequent non-ref allele // Find alternate allele - 2nd most frequent non-ref allele
// (maybe we should check for ties and eval both or check most common including quality scores) // (maybe we should check for ties and eval both or check most common including quality scores)
int altnum = -1; // start with first base numerical identity (0,1,2,3) int altnum = -1; // start with first base numerical identity (0,1,2,3)
double altcount = -1; // start with first base count double altcount = -1; // start with first base count
int refnum = nuc2num[ref];
for (int b=0; b<4; b++) { for (int b=0; b<4; b++) {
if ((b != refnum) && (base_counts[b] > altcount)) { if ((b != refnum) && (base_counts[b] > altcount)) {
altnum = b; altcount = base_counts[b]; altnum = b; altcount = base_counts[b];
} }
} }
assert(altnum != -1); assert(altnum != -1);
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases, quals, refnum, altnum, base_string.length()); AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases.getBytes(), quals, refnum, altnum, bases.length());
// Print dbSNP data if its there // Print dbSNP data if its there
if (false) { if (false) {
@ -76,6 +55,47 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
return alleleFreq; return alleleFreq;
} }
static public String getBases (LocusContext context) {
// Convert bases to CharArray
int numReads = context.getReads().size(); //numReads();
//byte[] bases = new byte[numReads];
String base_string = "";
//int refnum = nuc2num[ref];
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < numReads; i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
//bases[i] = read.getReadBases()[offset];
base_string += read.getReadString().charAt(offset);
}
return base_string;
}
static public double[][] getOneBaseQuals (LocusContext context) {
int numReads = context.getReads().size(); //numReads();
double[][] quals = new double[numReads][4];
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < numReads; i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
int callednum = nuc2num[read.getReadBases()[offset]];
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
for (int b=0; b<4; b++)
if (b != callednum)
quals[i][b] = nonref_quals;
}
return quals;
}
public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth) public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
{ {
@ -114,7 +134,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
double pG = P_G(N, qstar_N); double pG = P_G(N, qstar_N);
double posterior = pDq + pqG + pG; double posterior = pDq + pqG + pG;
if (posterior > best_posterior) if (posterior > best_posterior)
{ {
best_qstar = qstar; best_qstar = qstar;
best_qhat = q; best_qhat = q;
@ -124,6 +144,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
best_pqG = pqG; best_pqG = pqG;
best_pG = pG; best_pG = pG;
} }
//System.out.format("%.2f %.2f %5.2f %5.2f %5.2f %5.2f\n", q, qstar, pDq, pqG, pG, posterior);
} }
} }
@ -230,17 +251,9 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum) public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum)
{ {
if (alleleFreq.LOD >= 5) // Print RESULT data for confident calls
{ if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
return 0; return 0;
} }
@ -267,9 +280,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
void print_base_qual_matrix(double [][]quals, int numReads) { static void print_base_qual_matrix(double [][]quals) {
// Print quals for debugging // Print quals for debugging
System.out.println("Base quality matrix"); System.out.println("Base quality matrix");
int numReads = quals.length;
for (int b=0; b<4; b++) { for (int b=0; b<4; b++) {
System.out.print(num2nuc[b]); System.out.print(num2nuc[b]);
for (int i=0; i < numReads; i++){ for (int i=0; i < numReads; i++){

View File

@ -13,7 +13,7 @@ public class AlleleFrequencyEstimate {
public double LOD; public double LOD;
public int depth; public int depth;
public double getQstar() public double getQstar()
{ {
return qstar; return qstar;
} }
@ -35,6 +35,17 @@ public class AlleleFrequencyEstimate {
this.depth = depth; this.depth = depth;
} }
public String asTabularString() {
return String.format("RESULT %s %c %c %f %f %f %d\n",
location,
ref,
alt,
qhat,
qstar,
LOD,
depth);
}
public String asString() { public String asString() {
// Print out the called bases // Print out the called bases
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref // Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref