From 33dcfc858d939d15ef9e3cad0fe450d93cf5777d Mon Sep 17 00:00:00 2001 From: aaron Date: Thu, 19 Nov 2009 23:06:49 +0000 Subject: [PATCH] updates to the paper genotyper based on Mark's comments. There's still more work to do, including more testing. Also a 250% improvement in the getBases() and getQuals() of BasicPileup, which was nearly all of the runtime for the genotyper (using primitives instead of objects when possible). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2097 348d0f76-0448-11de-a6fe-93d51630548a --- .../papergenotyper/GATKPaperGenotyper.java | 35 +++++++--------- .../sting/utils/BasicPileup.java | 40 +++++++++++++------ 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java index a1f32b9da..fffc8d575 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/papergenotyper/GATKPaperGenotyper.java @@ -26,9 +26,6 @@ public class GATKPaperGenotyper extends LocusWalker // the possible diploid genotype strings private static enum GENOTYPE { AA, AC, AG, AT, CC, CG, CT, GG, GT, TT } - // the epsilon value we're using to model our error rate - private static double EPSILON = 1e-4; - @Argument(fullName = "call_location", shortName = "cl", doc = "File to which calls should be written", required = true) private File LOCATION = new File("genotyping.output"); @@ -46,22 +43,20 @@ public class GATKPaperGenotyper extends LocusWalker ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref.getBase(), context.getReads(), context.getOffsets()); double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(), - DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, - DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE); + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, + DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE); for (GENOTYPE genotype : GENOTYPE.values()) - for (byte pileupBase : pileup.getBases()) { - // todo -- epsilon isn't a constant variable, it's the de-phred error probabilities of the base - // you need to grab the qual score associated with this base and calcluate - // epsilon = pow(10, qual / -10.0) - // Also, only do the calculations below for bases with qual > 0 - for (char genotypeBase : genotype.toString().toCharArray()) - // todo -- all of these calculations should be in log10 space (like the priors are) - if (genotypeBase == pileupBase) - // todo -- potential int flow problems there. Needs parens - likelihoods[genotype.ordinal()] += 1 / 2 * (1 - EPSILON) + EPSILON / 3; - else - likelihoods[genotype.ordinal()] += EPSILON / 3; + for (int index = 0; index < pileup.getBases().length; index++) { + if (pileup.getQuals()[index] > 0) { + double epsilon = Math.pow(10, pileup.getQuals()[index] / -10.0); + byte pileupBase = pileup.getBases()[index]; + for (char genotypeBase : genotype.toString().toCharArray()) + if (genotypeBase == pileupBase) + likelihoods[genotype.ordinal()] += Math.log10(0.5 * ((1 - epsilon) + epsilon / 3)); + else + likelihoods[genotype.ordinal()] += Math.log10(epsilon / 3); + } } Integer sortedList[] = Utils.SortPermutation(likelihoods); @@ -72,8 +67,8 @@ public class GATKPaperGenotyper extends LocusWalker // create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString()) // and calculate the LOD score from best - ref (likelihoods[sortedList[9]] - likelihoods[sortedList[8]) return new SimpleCall(context.getLocation(), - GENOTYPE.values()[sortedList[9]].toString(), - likelihoods[sortedList[9]] - likelihoods[GENOTYPE.valueOf(refGenotype).ordinal()]); + GENOTYPE.values()[sortedList[9]].toString(), + likelihoods[sortedList[9]] - likelihoods[GENOTYPE.valueOf(refGenotype).ordinal()]); } /** @@ -111,7 +106,7 @@ public class GATKPaperGenotyper extends LocusWalker /** * when we finish traversing, close the result list - * @param result the final reduce result + * @param result the final reduce result */ public void onTraversalDone(SimpleCallList result) { result.close(); diff --git a/java/src/org/broadinstitute/sting/utils/BasicPileup.java b/java/src/org/broadinstitute/sting/utils/BasicPileup.java index a77a8f198..01994572b 100755 --- a/java/src/org/broadinstitute/sting/utils/BasicPileup.java +++ b/java/src/org/broadinstitute/sting/utils/BasicPileup.java @@ -85,19 +85,19 @@ abstract public class BasicPileup implements Pileup { // byte[] methods // public static byte[] getBases( List reads, List offsets ) { - return ArrayList2byte(getBasesAsArrayList( reads, offsets )); + return getBasesAsArray(reads,offsets,false); } public static byte[] getBases( List reads, List offsets, boolean includeDeletions ) { - return ArrayList2byte(getBasesAsArrayList( reads, offsets, includeDeletions )); + return getBasesAsArray(reads,offsets,includeDeletions); } public static byte[] getQuals( List reads, List offsets ) { - return ArrayList2byte(getQualsAsArrayList( reads, offsets )); + return getQualsAsArray( reads, offsets,false); } public static byte[] getQuals( List reads, List offsets, boolean includeDeletions ) { - return ArrayList2byte(getQualsAsArrayList( reads, offsets, includeDeletions )); + return getQualsAsArray( reads, offsets, includeDeletions); } // @@ -107,18 +107,27 @@ abstract public class BasicPileup implements Pileup { return getBasesAsArrayList( reads, offsets, false ); } - public static ArrayList getBasesAsArrayList( List reads, List offsets, boolean includeDeletions ) { - ArrayList bases = new ArrayList(reads.size()); + public static byte[] getBasesAsArray( List reads, List offsets, boolean includeDeletions ) { + byte array[] = new byte[reads.size()]; + int index = 0; for ( int i = 0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); int offset = offsets.get(i); if ( offset == -1 ) { if ( includeDeletions ) - bases.add((byte)DELETION_CHAR); + array[index++] = ((byte)DELETION_CHAR); } else { - bases.add(read.getReadBases()[offset]); + array[index++] = read.getReadBases()[offset]; } } + return array; + } + + + public static ArrayList getBasesAsArrayList( List reads, List offsets, boolean includeDeletions ) { + ArrayList bases = new ArrayList(reads.size()); + for (byte value : getBasesAsArray(reads, offsets, includeDeletions)) + bases.add(value); return bases; } @@ -128,6 +137,14 @@ abstract public class BasicPileup implements Pileup { public static ArrayList getQualsAsArrayList( List reads, List offsets, boolean includeDeletions ) { ArrayList quals = new ArrayList(reads.size()); + for (byte value : getQualsAsArray(reads, offsets, includeDeletions)) + quals.add(value); + return quals; + } + + public static byte[] getQualsAsArray( List reads, List offsets, boolean includeDeletions ) { + byte array[] = new byte[reads.size()]; + int index = 0; for ( int i = 0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); int offset = offsets.get(i); @@ -135,13 +152,12 @@ abstract public class BasicPileup implements Pileup { // skip deletion sites if ( offset == -1 ) { if ( includeDeletions ) // we need the qual vector to be the same length as base vector! - quals.add((byte)0); + array[index++] = ((byte)0); } else { - byte qual = read.getBaseQualities()[offset]; - quals.add(qual); + array[index++] = read.getBaseQualities()[offset]; } } - return quals; + return array; } public static ArrayList mappingQualPileup( List reads) {