diff --git a/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java index 5cd169740..0621d6ccb 100755 --- a/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java @@ -597,7 +597,7 @@ public class TraversalEngine { // if we don't have a particular interval we're processing, check them all, otherwise only operate at this // location - if ( ( interval == null && inLocations(locus.getLocation()) ) || interval.overlapsP(locus.getLocation()) ) { + if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) { //System.out.format("Working at %s\n", locus.getLocation().toString()); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java index 50b160a05..7961389cf 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java @@ -26,8 +26,10 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker= LOD_cutoff) { num_loci_confident += 1; } - if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff) + if (alleleFreq.qstar > 0.0 && alleleFreq.LOD >= LOD_cutoff) { // Confident variant. num_variants += 1; - boolean is_dbSNP_SNP = false; - boolean hapmap_hit = false; - - for ( ReferenceOrderedDatum datum : rodData ) - { - if ( datum != null ) - { - if ( datum instanceof rodDbSNP ) - { - rodDbSNP dbsnp = (rodDbSNP)datum; - if (dbsnp.isSNP()) is_dbSNP_SNP = true; - } - - if ( datum instanceof rodGFF ) - { - rodGFF hapmap = (rodGFF) datum; - String hapmap_genotype = hapmap.getFeature(); - String called_genotype = alleleFreq.asString(); - System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD()); - // There is a hapmap site here, is it correct? - if (hapmap_genotype.equals(called_genotype)) - { - hapmap_tp++; - } else { - hapmap_fp++; - } - } - } - } - if (is_dbSNP_SNP) { dbsnp_hits += 1; } } + // Convert qstar and LOD to give best qstar and a positive LOD for that qstar + double qstar; + double LOD; + if (alleleFreq.LOD < 0) { + qstar = 0.0; + LOD = -1 * alleleFreq.LOD; + }else{ + qstar = alleleFreq.qstar; + LOD = alleleFreq.LOD; + } + + //long confident_calls_at_hapmap_chip_sites = 0; + if (LOD >= LOD_cutoff && has_hapmap_chip_genotype) { + //confident_calls_at_hapmap_chip_sites++; + + // convert hapmap call to mixture of ref/nonref + String hapmap_genotype = hapmap_chip_genotype.getFeature(); + long refs=0, alts=0; + double hapmap_q; + + for (char c : hapmap_genotype.toCharArray()) { + if (c == alleleFreq.ref) { refs++; } + if (c == alleleFreq.alt) { alts++; } + } + + if (refs+alts > 0) { + hapmap_q = (float)alts / (refs+alts); + }else{ + hapmap_q = -1; + } + + // Hapmap debug info + System.out.format("HAPMAP %.2f %.2f %.2f ", hapmap_q, qstar, LOD); + String called_genotype = alleleFreq.asString(); + System.out.format("%s %s %c %c %.2f", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt, alleleFreq.LOD); + + // Calculate genotyping performance - did we get the correct genotype of the N+1 choices? + if (hapmap_q != -1 && hapmap_q == qstar) { + hapmap_genotype_correct++; + System.out.print("\n"); + }else{ + hapmap_genotype_incorrect++; + System.out.printf(" INCORRECT GENOTYPE Bases: %s\n", AlleleFrequencyWalker.getBases(context)); + AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context)); + } + + // Now calculate ref / var performance - did we correctly classify the site as + // reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here + boolean hapmap_var = hapmap_q != 0.0; + boolean called_var = qstar != 0.0; + if (hapmap_var != called_var) { + hapmap_refvar_incorrect++; + }else{ + hapmap_refvar_correct++; + } + + } + + /*String hapmap_genotype = hapmap_chip_genotype.getFeature(); + String called_genotype = alleleFreq.asString(); + System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.LOD); + // There is a hapmap site here, is it correct? + if (hapmap_genotype.equals(called_genotype)) + { + hapmap_tp++; + } else { + hapmap_fp++; + } */ + return alleleFreq; } - public void printMetrics() + public void printMetrics() { if (num_loci_total == 0) { return; } System.out.printf("\n"); System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff); System.out.printf("METRICS -------------------------------------------------\n"); - System.out.printf("METRICS Total loci : %d\n", num_loci_total); - System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); + System.out.printf("METRICS Total loci : %d\n", num_loci_total); + System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); if (num_variants != 0) { - System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants); - System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); - System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp); - System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp); - System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp); - System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp)); + System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants); + System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); + System.out.printf("METRICS -------------------------------------------------\n"); + System.out.printf("METRICS -- Hapmap Genotyping performance --\n"); + System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect); + System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct); + System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect); + System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect)); + System.out.printf("METRICS -------------------------------------------------\n"); + System.out.printf("METRICS -- Hapmap Reference/Variant performance --\n"); + System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect); + System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct); + System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect); + System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect)); } System.out.println(); } @@ -118,17 +189,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker= 5) || (alleleFreq.LOD <= -5)) - { - System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n", - alleleFreq.location, - alleleFreq.ref, - alleleFreq.alt, - alleleFreq.qhat, - alleleFreq.qstar, - alleleFreq.LOD, - alleleFreq.depth)); - } + // Print RESULT data for confident calls + if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); } if (this.num_loci_total % 10000 == 0) { printMetrics(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java index 870225dde..c3ecd8724 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java @@ -12,56 +12,35 @@ import java.util.Arrays; public class AlleleFrequencyWalker extends BasicLociWalker { - public AlleleFrequencyEstimate map(List rodData, char ref, LocusContext context) { - // Convert context data into bases and 4-base quals + public AlleleFrequencyEstimate map(List rodData, char ref, LocusContext context) { // Set number of chromosomes, N, to 2 for now int N = 2; - // Convert bases to CharArray - int numReads = context.getReads().size(); //numReads(); - byte[] bases = new byte[numReads]; - String base_string = ""; - double[][] quals = new double[numReads][4]; - int refnum = nuc2num[ref]; - - List reads = context.getReads(); - List offsets = context.getOffsets(); - for (int i =0; i < numReads; i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - bases[i] = read.getReadBases()[offset]; - base_string += read.getReadString().charAt(offset); - int callednum = nuc2num[bases[i]]; - quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0); - - // Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual - double nonref_quals = (1.0 - quals[i][callednum]) / 3; - for (int b=0; b<4; b++) - if (b != callednum) - quals[i][b] = nonref_quals; - } + // Convert context data into bases and 4-base quals + String bases = getBases(context); + double quals[][] = getOneBaseQuals(context); // Count bases int[] base_counts = new int[4]; - for (byte b : bases) + for (byte b : bases.getBytes()) base_counts[nuc2num[b]]++; - + // Find alternate allele - 2nd most frequent non-ref allele // (maybe we should check for ties and eval both or check most common including quality scores) int altnum = -1; // start with first base numerical identity (0,1,2,3) double altcount = -1; // start with first base count + int refnum = nuc2num[ref]; for (int b=0; b<4; b++) { if ((b != refnum) && (base_counts[b] > altcount)) { - altnum = b; altcount = base_counts[b]; + altnum = b; altcount = base_counts[b]; } } assert(altnum != -1); - AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases, quals, refnum, altnum, base_string.length()); + AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases.getBytes(), quals, refnum, altnum, bases.length()); // Print dbSNP data if its there if (false) { @@ -76,6 +55,47 @@ public class AlleleFrequencyWalker extends BasicLociWalker reads = context.getReads(); + List offsets = context.getOffsets(); + for (int i =0; i < numReads; i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + //bases[i] = read.getReadBases()[offset]; + base_string += read.getReadString().charAt(offset); + } + return base_string; + } + + static public double[][] getOneBaseQuals (LocusContext context) { + int numReads = context.getReads().size(); //numReads(); + double[][] quals = new double[numReads][4]; + + List reads = context.getReads(); + List offsets = context.getOffsets(); + for (int i =0; i < numReads; i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + int callednum = nuc2num[read.getReadBases()[offset]]; + quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0); + + // Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual + double nonref_quals = (1.0 - quals[i][callednum]) / 3; + for (int b=0; b<4; b++) + if (b != callednum) + quals[i][b] = nonref_quals; + } + + return quals; + } + public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth) { @@ -114,7 +134,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker best_posterior) + if (posterior > best_posterior) { best_qstar = qstar; best_qhat = q; @@ -124,6 +144,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker= 5) - { - System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n", - alleleFreq.location, - alleleFreq.ref, - alleleFreq.alt, - alleleFreq.qhat, - alleleFreq.qstar, - alleleFreq.LOD, - alleleFreq.depth)); - } + // Print RESULT data for confident calls + if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); } + return 0; } @@ -267,9 +280,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker