updates to the paper genotyper based on Mark's comments. There's still more work to do, including more testing.

Also a 250% improvement in the getBases() and getQuals() of BasicPileup, which was nearly all of the runtime for the genotyper (using primitives instead of objects when possible).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2097 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-11-19 23:06:49 +00:00
parent 22aaf8c5e0
commit 33dcfc858d
2 changed files with 43 additions and 32 deletions

View File

@ -26,9 +26,6 @@ public class GATKPaperGenotyper extends LocusWalker<SimpleCall, SimpleCallList>
// the possible diploid genotype strings
private static enum GENOTYPE { AA, AC, AG, AT, CC, CG, CT, GG, GT, TT }
// the epsilon value we're using to model our error rate
private static double EPSILON = 1e-4;
@Argument(fullName = "call_location", shortName = "cl", doc = "File to which calls should be written", required = true)
private File LOCATION = new File("genotyping.output");
@ -46,22 +43,20 @@ public class GATKPaperGenotyper extends LocusWalker<SimpleCall, SimpleCallList>
ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref.getBase(), context.getReads(), context.getOffsets());
double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,
DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,
DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
for (GENOTYPE genotype : GENOTYPE.values())
for (byte pileupBase : pileup.getBases()) {
// todo -- epsilon isn't a constant variable, it's the de-phred error probabilities of the base
// you need to grab the qual score associated with this base and calcluate
// epsilon = pow(10, qual / -10.0)
// Also, only do the calculations below for bases with qual > 0
for (char genotypeBase : genotype.toString().toCharArray())
// todo -- all of these calculations should be in log10 space (like the priors are)
if (genotypeBase == pileupBase)
// todo -- potential int flow problems there. Needs parens
likelihoods[genotype.ordinal()] += 1 / 2 * (1 - EPSILON) + EPSILON / 3;
else
likelihoods[genotype.ordinal()] += EPSILON / 3;
for (int index = 0; index < pileup.getBases().length; index++) {
if (pileup.getQuals()[index] > 0) {
double epsilon = Math.pow(10, pileup.getQuals()[index] / -10.0);
byte pileupBase = pileup.getBases()[index];
for (char genotypeBase : genotype.toString().toCharArray())
if (genotypeBase == pileupBase)
likelihoods[genotype.ordinal()] += Math.log10(0.5 * ((1 - epsilon) + epsilon / 3));
else
likelihoods[genotype.ordinal()] += Math.log10(epsilon / 3);
}
}
Integer sortedList[] = Utils.SortPermutation(likelihoods);
@ -72,8 +67,8 @@ public class GATKPaperGenotyper extends LocusWalker<SimpleCall, SimpleCallList>
// create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString())
// and calculate the LOD score from best - ref (likelihoods[sortedList[9]] - likelihoods[sortedList[8])
return new SimpleCall(context.getLocation(),
GENOTYPE.values()[sortedList[9]].toString(),
likelihoods[sortedList[9]] - likelihoods[GENOTYPE.valueOf(refGenotype).ordinal()]);
GENOTYPE.values()[sortedList[9]].toString(),
likelihoods[sortedList[9]] - likelihoods[GENOTYPE.valueOf(refGenotype).ordinal()]);
}
/**
@ -111,7 +106,7 @@ public class GATKPaperGenotyper extends LocusWalker<SimpleCall, SimpleCallList>
/**
* when we finish traversing, close the result list
* @param result the final reduce result
* @param result the final reduce result
*/
public void onTraversalDone(SimpleCallList result) {
result.close();

View File

@ -85,19 +85,19 @@ abstract public class BasicPileup implements Pileup {
// byte[] methods
//
public static byte[] getBases( List<SAMRecord> reads, List<Integer> offsets ) {
return ArrayList2byte(getBasesAsArrayList( reads, offsets ));
return getBasesAsArray(reads,offsets,false);
}
public static byte[] getBases( List<SAMRecord> reads, List<Integer> offsets, boolean includeDeletions ) {
return ArrayList2byte(getBasesAsArrayList( reads, offsets, includeDeletions ));
return getBasesAsArray(reads,offsets,includeDeletions);
}
public static byte[] getQuals( List<SAMRecord> reads, List<Integer> offsets ) {
return ArrayList2byte(getQualsAsArrayList( reads, offsets ));
return getQualsAsArray( reads, offsets,false);
}
public static byte[] getQuals( List<SAMRecord> reads, List<Integer> offsets, boolean includeDeletions ) {
return ArrayList2byte(getQualsAsArrayList( reads, offsets, includeDeletions ));
return getQualsAsArray( reads, offsets, includeDeletions);
}
//
@ -107,18 +107,27 @@ abstract public class BasicPileup implements Pileup {
return getBasesAsArrayList( reads, offsets, false );
}
public static ArrayList<Byte> getBasesAsArrayList( List<SAMRecord> reads, List<Integer> offsets, boolean includeDeletions ) {
ArrayList<Byte> bases = new ArrayList<Byte>(reads.size());
public static byte[] getBasesAsArray( List<SAMRecord> reads, List<Integer> offsets, boolean includeDeletions ) {
byte array[] = new byte[reads.size()];
int index = 0;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
if ( offset == -1 ) {
if ( includeDeletions )
bases.add((byte)DELETION_CHAR);
array[index++] = ((byte)DELETION_CHAR);
} else {
bases.add(read.getReadBases()[offset]);
array[index++] = read.getReadBases()[offset];
}
}
return array;
}
public static ArrayList<Byte> getBasesAsArrayList( List<SAMRecord> reads, List<Integer> offsets, boolean includeDeletions ) {
ArrayList<Byte> bases = new ArrayList<Byte>(reads.size());
for (byte value : getBasesAsArray(reads, offsets, includeDeletions))
bases.add(value);
return bases;
}
@ -128,6 +137,14 @@ abstract public class BasicPileup implements Pileup {
public static ArrayList<Byte> getQualsAsArrayList( List<SAMRecord> reads, List<Integer> offsets, boolean includeDeletions ) {
ArrayList<Byte> quals = new ArrayList<Byte>(reads.size());
for (byte value : getQualsAsArray(reads, offsets, includeDeletions))
quals.add(value);
return quals;
}
public static byte[] getQualsAsArray( List<SAMRecord> reads, List<Integer> offsets, boolean includeDeletions ) {
byte array[] = new byte[reads.size()];
int index = 0;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
@ -135,13 +152,12 @@ abstract public class BasicPileup implements Pileup {
// skip deletion sites
if ( offset == -1 ) {
if ( includeDeletions ) // we need the qual vector to be the same length as base vector!
quals.add((byte)0);
array[index++] = ((byte)0);
} else {
byte qual = read.getBaseQualities()[offset];
quals.add(qual);
array[index++] = read.getBaseQualities()[offset];
}
}
return quals;
return array;
}
public static ArrayList<Byte> mappingQualPileup( List<SAMRecord> reads) {