- A lot of code cleaned up; separated metrics code from AlleleFrequencyMetricsWalker into AlleleMetrics and eliminated the former class. AFMW (aside from being a name so long that it warrants an acronym) can now be implemented by passing an option to AlleleFreqeuncyWalker that logs metrics to a file.

- AlleleMetrics and AlleleMetricrsWalker are now ready to take a list of clasess that implement the AllelicVariant interface
- Switched a genome location in AlleleFrequencyEstimate from String to GenomeLoc which makes way more sense.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@280 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-04-03 02:09:10 +00:00
parent c6ab60ee04
commit e3ac0cb500
4 changed files with 101 additions and 81 deletions

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.playground.utils.AlleleMetrics;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -15,11 +16,13 @@ import java.util.Arrays;
import java.util.Random; import java.util.Random;
import java.io.PrintStream; import java.io.PrintStream;
public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate, String> public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate, String>// implements AllelicVariant
{ {
@Argument public int N; @Argument public int N;
@Argument(required=false,defaultValue="0") public int DOWNSAMPLE; @Argument(required=false,defaultValue="0") public int DOWNSAMPLE;
@Argument public String GFF_OUTPUT_FILE; @Argument public String GFF_OUTPUT_FILE;
@Argument(shortName="met", doc="Turns on logging of metrics on the fly with AlleleFrequency calculation") public boolean LOG_METRICS;
@Argument(required=false, defaultValue="", doc="Name of file where metrics will output") public String METRICS_OUTPUT_FILE;
protected static Logger logger = Logger.getLogger(AlleleFrequencyWalker.class); protected static Logger logger = Logger.getLogger(AlleleFrequencyWalker.class);
@ -30,7 +33,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
{ {
// Convert context data into bases and 4-base quals // Convert context data into bases and 4-base quals
String bases = getBases(context); String bases = getBases(context);
double quals[][] = getOneBaseQuals(context); double quals[][] = getQuals(context);
/* /*
// DEBUG: print the data for a read // DEBUG: print the data for a read
@ -87,7 +90,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
} }
assert(altnum != -1); assert(altnum != -1);
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases.getBytes(), quals, refnum, altnum, bases.length()); AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation(), N, bases.getBytes(), quals, refnum, altnum, bases.length());
alleleFreq.notes = String.format("A:%d C:%d G:%d T:%d", alleleFreq.notes = String.format("A:%d C:%d G:%d T:%d",
base_counts[nuc2num['A']], base_counts[nuc2num['A']],
@ -108,6 +111,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
logger.debug(String.format(" => result is %s", alleleFreq)); logger.debug(String.format(" => result is %s", alleleFreq));
if (LOG_METRICS) metrics.nextPosition(alleleFreq, rodData);
return alleleFreq; return alleleFreq;
} }
@ -128,7 +133,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
return new String(bases); return new String(bases);
} }
static public double[][] getOneBaseQuals (LocusContext context) static public double[][] getQuals (LocusContext context)
{ {
int numReads = context.getReads().size(); //numReads(); int numReads = context.getReads().size(); //numReads();
double[][] quals = new double[numReads][4]; double[][] quals = new double[numReads][4];
@ -155,13 +160,13 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
}else{ }else{
assert (SQ_field instanceof byte[]); assert (SQ_field instanceof byte[]);
byte[] hex_quals = (byte[]) SQ_field; byte[] hex_quals = (byte[]) SQ_field;
System.out.printf("SQ field (hex): %s\n", bytesToHexString(hex_quals)); //System.out.printf("SQ field (hex): %s\n", bytesToHexString(hex_quals));
System.out.printf("SAM record: %s\n", read.format()); //System.out.printf("SAM record: %s\n", read.format());
int hex_qual = hex_quals[offset]; int hex_qual = hex_quals[offset];
int called2num = hex_qual & 0x3; int called2num = hex_qual & 0x3;
double qual2 = (double)(hex_qual >> 2) / 100.0; double qual2 = (double)(hex_qual >> 2) / 100.0;
System.out.printf("2ND %x %d %f\n", hex_qual, called2num, qual2); //System.out.printf("2ND %x %d %f\n", hex_qual, called2num, qual2);
quals[i][called2num] = qual2; quals[i][called2num] = qual2;
// //
@ -191,7 +196,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
return (char) ((value < 10) ? ('0' + value) : ('A' + value - 10)); return (char) ((value < 10) ? ('0' + value) : ('A' + value - 10));
} }
public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth) public AlleleFrequencyEstimate AlleleFrequencyEstimator(GenomeLoc location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
{ {
// q = hypothetical %nonref // q = hypothetical %nonref
@ -251,7 +256,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
Arrays.sort(bestMixtures); Arrays.sort(bestMixtures);
// Calculate Lod of the mixture versus other possible // Calculate Lod of the mixture versus other possible
// Answers how confident are we in the best mixture versus the next best mixture // Answers how confident are we in the best mixture versus the nextPosition best mixture
double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior; double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior;
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location, AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
@ -375,48 +380,42 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
return result; return result;
} }
private String confident_ref_interval_start = ""; private String confident_ref_interval_contig = "";
private long confident_ref_interval_start = 0;
private double confident_ref_interval_LOD_sum = 0; private double confident_ref_interval_LOD_sum = 0;
private double confident_ref_interval_length = 0; private double confident_ref_interval_length = 0;
private int last_position_considered = -1; private long last_position_considered = -1;
private boolean inside_confident_ref_interval = false; private boolean inside_confident_ref_interval = false;
AlleleMetrics metrics;
public String reduceInit() public String reduceInit()
{ {
confident_ref_interval_start = ""; confident_ref_interval_contig = "";
confident_ref_interval_start = 0;
confident_ref_interval_LOD_sum = 0; confident_ref_interval_LOD_sum = 0;
confident_ref_interval_length = 0; confident_ref_interval_length = 0;
last_position_considered = -1; last_position_considered = -1;
inside_confident_ref_interval = false; inside_confident_ref_interval = false;
return ""; if (LOG_METRICS) metrics = new AlleleMetrics("SNTH");//METRICS_OUTPUT_FILE);
return "";
} }
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{ {
// Print RESULT data for confident calls // Print RESULT data for confident calls
String[] tokens; long current_offset = alleleFreq.location.getStart(); //Integer.parseInt(tokens[1]);
tokens = alleleFreq.location.split(":");
int current_offset = Integer.parseInt(tokens[1]);
if (inside_confident_ref_interval && if (inside_confident_ref_interval &&
((alleleFreq.lodVsRef > -5.0) || (current_offset != last_position_considered + 1)) ) ((alleleFreq.lodVsRef > -5.0) || (current_offset != last_position_considered + 1)) )
{ {
// No longer hom-ref, so output a ref line. // No longer hom-ref, so output a ref line.
tokens = confident_ref_interval_start.split(":");
String contig = tokens[0];
int start = Integer.parseInt(tokens[1]);
tokens = alleleFreq.location.split(":");
int end = last_position_considered;
double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length; double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length;
output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n", output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n",
contig, confident_ref_interval_contig,
start, confident_ref_interval_start,
end, last_position_considered,
lod, lod,
(int)(confident_ref_interval_length)); (int)(confident_ref_interval_length));
@ -435,7 +434,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef <= -5.0)) else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef <= -5.0))
{ {
// We moved into a hom-ref region so start a new interval. // We moved into a hom-ref region so start a new interval.
confident_ref_interval_start = alleleFreq.location; confident_ref_interval_contig = alleleFreq.location.getContig();
confident_ref_interval_start = alleleFreq.location.getStart();
confident_ref_interval_LOD_sum = alleleFreq.lodVsRef; confident_ref_interval_LOD_sum = alleleFreq.lodVsRef;
confident_ref_interval_length = 1; confident_ref_interval_length = 1;
inside_confident_ref_interval = true; inside_confident_ref_interval = true;
@ -444,6 +444,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
last_position_considered = current_offset; last_position_considered = current_offset;
if (alleleFreq.lodVsRef >= 5) { this.output.print(alleleFreq.asGFFString()); } if (alleleFreq.lodVsRef >= 5) { this.output.print(alleleFreq.asGFFString()); }
if (LOG_METRICS) metrics.printMetricsAtLocusIntervals(1000);
return ""; return "";
} }
@ -492,17 +493,13 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
if (inside_confident_ref_interval) if (inside_confident_ref_interval)
{ {
// if we have a confident reference interval still hanging open, close it. // if we have a confident reference interval still hanging open, close it.
String tokens[] = confident_ref_interval_start.split(":");
String contig = tokens[0];
int start = Integer.parseInt(tokens[1]);
int end = last_position_considered;
double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length; double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length;
output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n", output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n",
contig, confident_ref_interval_contig,
start, confident_ref_interval_start,
end, last_position_considered,
lod, lod,
(int)(confident_ref_interval_length)); (int)(confident_ref_interval_length));
@ -522,6 +519,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
e.printStackTrace(); e.printStackTrace();
System.exit(-1); System.exit(-1);
} }
if (LOG_METRICS) metrics.printMetrics();
} }
static void print_base_qual_matrix(double [][]quals) { static void print_base_qual_matrix(double [][]quals) {
@ -569,7 +568,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}}; {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}};
AlleleFrequencyWalker w = new AlleleFrequencyWalker(); AlleleFrequencyWalker w = new AlleleFrequencyWalker();
AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator("null", N, het_bases, het_quals, 0, 1, 20); AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(null, N, het_bases, het_quals, 0, 1, 20);
System.out.print(String.format("50%% Het : %s %c %c %f %f %f %d %s\n", System.out.print(String.format("50%% Het : %s %c %c %f %f %f %d %s\n",
"null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null")); "null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null"));
} }
@ -686,7 +685,7 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
int N = 10; int N = 10;
AlleleFrequencyWalker w = new AlleleFrequencyWalker(); AlleleFrequencyWalker w = new AlleleFrequencyWalker();
w.N = 10; w.N = 10;
AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator("null", N, het_bases, het_quals, 0, 1, 20); AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(null, N, het_bases, het_quals, 0, 1, 20);
System.out.print(String.format("10%% Het : %s %c %c %f %f %f %d %s\n", System.out.print(String.format("10%% Het : %s %c %c %f %f %f %d %s\n",
"null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null")); "null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null"));
} }

View File

@ -0,0 +1,27 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: andrewk
* Date: Apr 2, 2009
* Time: 9:01:44 PM
* To change this template use File | Settings | File Templates.
*/
public class AlleleMetricsWalker {
// Class that will walk over various metrics in a reference ordered way
// This class walks over the genome in reference order and calls AlleleMetrics on each class
// Hapmap and dbSNP tracks are taken from the command line
// At first pass, this will at least be able to walk over a GFF file and compare to the hapmap and dbsnp
// tracks specified on the command line and handed in via the LocusContext
public void map(List<AllelicVariant> avdata) {
}
}

View File

@ -1,10 +1,11 @@
package org.broadinstitute.sting.playground.utils; package org.broadinstitute.sting.playground.utils;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
public class AlleleFrequencyEstimate { public class AlleleFrequencyEstimate {
//AlleleFrequencyEstimate(); //AlleleFrequencyEstimate();
public String location; public GenomeLoc location;
public char ref; public char ref;
public char alt; public char alt;
public int N; public int N;
@ -15,7 +16,9 @@ public class AlleleFrequencyEstimate {
public int depth; public int depth;
public String notes; public String notes;
public AlleleFrequencyEstimate(String location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, int depth) GenomeLoc l;
public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, int depth)
{ {
this.location = location; this.location = location;
this.ref = ref; this.ref = ref;
@ -31,12 +34,10 @@ public class AlleleFrequencyEstimate {
public String asGFFString() public String asGFFString()
{ {
String[] tokens;
tokens = location.split(":");
return String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\tREF %c\t;\tALT %c\t;\tFREQ %f\n", return String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\tREF %c\t;\tALT %c\t;\tFREQ %f\n",
tokens[0], location.getContig(),
tokens[1], location.getStart(),
tokens[1], location.getStart(),
lodVsRef, lodVsRef,
ref, ref,
alt, alt,

View File

@ -1,25 +1,21 @@
package org.broadinstitute.sting.playground.gatk.walkers; package org.broadinstitute.sting.playground.utils;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import java.util.List; import java.util.List;
import java.io.PrintStream;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
* User: andrewk * User: andrewk
* Date: Mar 18, 2009 * Date: Apr 1, 2009
* Time: 5:28:58 PM * Time: 5:53:21 PM
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class AlleleMetrics {
public class AlleleFrequencyMetricsWalker extends LocusWalker<AlleleFrequencyEstimate, String>
{
long dbsnp_hits=0; long dbsnp_hits=0;
long num_variants=0; long num_variants=0;
@ -31,12 +27,24 @@ public class AlleleFrequencyMetricsWalker extends LocusWalker<AlleleFrequencyEst
long hapmap_refvar_correct = 0; long hapmap_refvar_correct = 0;
long hapmap_refvar_incorrect = 0; long hapmap_refvar_incorrect = 0;
AlleleFrequencyWalker caller; protected PrintStream out;
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) public AlleleMetrics(String MetricsOutputFile) {
{ try
AlleleFrequencyEstimate alleleFreq = caller.map(rodData, ref, context); {
/*if ( MetricsOutputFile.equals("-") )
this.out = out;
else*/
this.out = new PrintStream(MetricsOutputFile);
}
catch (Exception e)
{
e.printStackTrace();
System.exit(-1);
}
}
public void nextPosition(AlleleFrequencyEstimate alleleFreq, List<ReferenceOrderedDatum> rodData) {
num_loci_total += 1; num_loci_total += 1;
boolean is_dbSNP_SNP = false; boolean is_dbSNP_SNP = false;
@ -47,7 +55,7 @@ public class AlleleFrequencyMetricsWalker extends LocusWalker<AlleleFrequencyEst
{ {
if ( datum != null ) if ( datum != null )
{ {
if ( datum instanceof rodDbSNP ) if ( datum instanceof rodDbSNP)
{ {
rodDbSNP dbsnp = (rodDbSNP)datum; rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true; if (dbsnp.isSNP()) is_dbSNP_SNP = true;
@ -64,9 +72,9 @@ public class AlleleFrequencyMetricsWalker extends LocusWalker<AlleleFrequencyEst
if (Math.abs(alleleFreq.lodVsRef) >= LOD_cutoff) { num_loci_confident += 1; } if (Math.abs(alleleFreq.lodVsRef) >= LOD_cutoff) { num_loci_confident += 1; }
if (alleleFreq.qstar > 0.0 && alleleFreq.lodVsRef >= LOD_cutoff) if (alleleFreq.qstar > 0.0 && alleleFreq.lodVsRef >= LOD_cutoff)
{ {
// Confident variant. // Confident variant.
num_variants += 1; num_variants += 1;
if (is_dbSNP_SNP) if (is_dbSNP_SNP)
@ -105,7 +113,8 @@ public class AlleleFrequencyMetricsWalker extends LocusWalker<AlleleFrequencyEst
hapmap_genotype_correct++; hapmap_genotype_correct++;
}else{ }else{
hapmap_genotype_incorrect++; hapmap_genotype_incorrect++;
System.out.printf(" INCORRECT GENOTYPE Bases: %s", AlleleFrequencyWalker.getBases(context)); //System.out.printf(" INCORRECT GENOTYPE Bases: %s", AlleleFrequencyWalker.getBases(context));
System.out.printf(" INCORRECT GENOTYPE");
//AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context)); //AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context));
} }
} }
@ -118,16 +127,14 @@ public class AlleleFrequencyMetricsWalker extends LocusWalker<AlleleFrequencyEst
boolean called_var = alleleFreq.qstar != 0.0; boolean called_var = alleleFreq.qstar != 0.0;
if (hapmap_q != -1 && hapmap_var != called_var) { if (hapmap_q != -1 && hapmap_var != called_var) {
hapmap_refvar_incorrect++; hapmap_refvar_incorrect++;
System.out.printf(" INCORRECT REFVAR CALL");
}else{ }else{
hapmap_refvar_correct++; hapmap_refvar_correct++;
System.out.printf(" INCORRECT REFVAR CALL Bases: %s\n", AlleleFrequencyWalker.getBases(context));
} }
} }
out.print("\n"); out.print("\n");
} }
return alleleFreq;
} }
public void printMetrics() public void printMetrics()
@ -159,23 +166,9 @@ public class AlleleFrequencyMetricsWalker extends LocusWalker<AlleleFrequencyEst
out.println(); out.println();
} }
public void onTraversalDone(String result) public void printMetricsAtLocusIntervals(int loci_interval) {
{ if (num_loci_total % loci_interval == 0) printMetrics();
printMetrics();
} }
public String reduceInit()
{
caller = new AlleleFrequencyWalker();
return "";
}
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
if ((alleleFreq.lodVsRef >= 5) || (alleleFreq.lodVsRef <= -5)) { System.out.print(alleleFreq.asGFFString()); }
if (this.num_loci_total % 1000 == 0) { printMetrics(); }
return "null";
}
} }