diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index ea7fde2d8..6c011aea6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -34,6 +33,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import static java.lang.Math.log10; import static java.lang.Math.pow; @@ -69,6 +72,8 @@ import static java.lang.Math.pow; * model. 
*/ public class DiploidSNPGenotypeLikelihoods implements Cloneable { + public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; + protected final static int FIXED_PLOIDY = 2; protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); @@ -84,22 +89,31 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { protected DiploidSNPGenotypePriors priors = null; + // TODO: don't calculate this each time through + protected double log10_PCR_error_3; + protected double log10_1_minus_PCR_error; + /** * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype * */ public DiploidSNPGenotypeLikelihoods() { this.priors = new DiploidSNPGenotypePriors(); + log10_PCR_error_3 = log10(DEFAULT_PCR_ERROR_RATE) - log10_3; + log10_1_minus_PCR_error = log10(1.0 - DEFAULT_PCR_ERROR_RATE); setToZero(); } /** - * Create a new GenotypeLikelhoods object with given priors for each diploid genotype + * Create a new GenotypeLikelihoods object with given priors and PCR error rate for each diploid genotype * - * @param priors priors + * @param priors priors + * @param PCR_error_rate the PCR error rate */ - public DiploidSNPGenotypeLikelihoods(DiploidSNPGenotypePriors priors) { + public DiploidSNPGenotypeLikelihoods(DiploidSNPGenotypePriors priors, double PCR_error_rate) { this.priors = priors; + log10_PCR_error_3 = log10(PCR_error_rate) - log10_3; + log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate); setToZero(); } @@ -242,45 +256,62 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) { int n = 0; - // todo: first validate that my changes were good by passing in fragments representing just a single read... - //Set fragments = new HashSet(); + // TODO-- move this outside the UG, e.g. 
to ReadBackedPileup + // combine paired reads into a single fragment + HashMap fragments = new HashMap(); for ( PileupElement p : pileup ) { - if ( usableBase(p, ignoreBadBases) ) { + Set fragment = new HashSet(); + String readName = p.getRead().getReadName(); - /**** - Set fragment = new HashSet(); - fragment.add(p); - fragments.add(new PerFragmentPileupElement(fragment)); - } + if ( fragments.containsKey(readName) ) + fragment.addAll(fragments.get(readName).getPileupElements()); + + fragment.add(p); + fragments.put(readName, new PerFragmentPileupElement(fragment)); } // for each fragment, add to the likelihoods - for ( PerFragmentPileupElement fragment : fragments ) { - n += add(fragment, capBaseQualsAtMappingQual); - } - ****/ - - byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual(); - n += add(p.getBase(), qual, p.getRead(), p.getOffset()); - } + for ( PerFragmentPileupElement fragment : fragments.values() ) { + n += add(fragment, ignoreBadBases, capBaseQualsAtMappingQual); } return n; } - // public int add(PerFragmentPileupElement fragment, boolean capBaseQualsAtMappingQual) { + public int add(PerFragmentPileupElement fragment, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) { + // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine + // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. + // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. 
+ byte observedBase1 = 0, observedBase2 = 0, qualityScore1 = 0, qualityScore2 = 0; + for ( PileupElement p : fragment ) { + if ( !usableBase(p, ignoreBadBases) ) + continue; - public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) { - if ( qualityScore == 0 ) { // zero quals are wrong - return 0; + byte qual = p.getQual(); + if ( qual > SAMUtils.MAX_PHRED_SCORE ) + throw new UserException.MalformedBam(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + if ( capBaseQualsAtMappingQual ) + qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); + + if ( qualityScore1 == 0 ) { + observedBase1 = p.getBase(); + qualityScore1 = qual; + } else { + observedBase2 = p.getBase(); + qualityScore2 = qual; + } } + // abort early if we didn't see any good bases + if ( observedBase1 == 0 && observedBase2 == 0 ) + return 0; + // Just look up the cached result if it's available, or compute and store it DiploidSNPGenotypeLikelihoods gl; - if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) { - gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset); + if ( ! 
inCache(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY) ) { + gl = calculateCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY); } else { - gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read); + gl = getCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY); } // for bad bases, there are no likelihoods @@ -292,11 +323,10 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { for ( DiploidGenotype g : DiploidGenotype.values() ) { double likelihood = likelihoods[g.ordinal()]; - if ( VERBOSE ) { - boolean fwdStrand = ! read.getReadNegativeStrandFlag(); - System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", - observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood); - } + //if ( VERBOSE ) { + // System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", + // observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood); + //} log10Likelihoods[g.ordinal()] += likelihood; log10Posteriors[g.ordinal()] += likelihood; @@ -305,52 +335,50 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { return 1; } - static DiploidSNPGenotypeLikelihoods[][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2]; + static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY]; - protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) { - return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null; + protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) 
!= null; } - protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) { - DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read); + protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy); if ( gl == null ) - throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s", - observedBase, qualityScore, ploidy, read)); + throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d", + observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy)); return gl; } - protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) { - DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset); - setCache(CACHE, observedBase, qualityScore, ploidy, read, gl); + protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); + setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl); return gl; } - protected void setCache( DiploidSNPGenotypeLikelihoods[][][][] cache, - byte observedBase, byte qualityScore, int ploidy, - SAMRecord read, DiploidSNPGenotypeLikelihoods val ) { - int i = BaseUtils.simpleBaseToBaseIndex(observedBase); - int j = qualityScore; - if ( j > 
SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBam(read, String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, j, read.getReadName())); - int k = ploidy; - int x = strandIndex(! read.getReadNegativeStrandFlag() ); + protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][] cache, + byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy, + DiploidSNPGenotypeLikelihoods val ) { + int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); + int j = qualityScore1; + int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; + int l = qualityScore2; + int m = ploidy; - cache[i][j][k][x] = val; + cache[i][j][k][l][m] = val; } - protected DiploidSNPGenotypeLikelihoods getCache( DiploidSNPGenotypeLikelihoods[][][][] cache, - byte observedBase, byte qualityScore, int ploidy, SAMRecord read) { - int i = BaseUtils.simpleBaseToBaseIndex(observedBase); - int j = qualityScore; - if ( j > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBam(read, String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, j, read.getReadName())); - int k = ploidy; - int x = strandIndex(! read.getReadNegativeStrandFlag() ); - return cache[i][j][k][x]; + protected DiploidSNPGenotypeLikelihoods getCache(DiploidSNPGenotypeLikelihoods[][][][][] cache, + byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); + int j = qualityScore1; + int k = qualityScore2 != 0 ? 
BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; + int l = qualityScore2; + int m = ploidy; + return cache[i][j][k][l][m]; } - protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) { - double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase, qualityScore, read); + protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { + double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); try { @@ -388,24 +416,36 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * Updates likelihoods and posteriors to reflect an additional observation of observedBase with * qualityScore. * - * @param observedBase observed base - * @param qualityScore base quality - * @param read SAM read + * @param observedBase1 the base observed on the 1st read of the fragment + * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA + * @param observedBase2 the base observed on the 2nd read of the fragment + * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) */ - protected double[] computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read) { + protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { double[] log10FourBaseLikelihoods = baseZeros.clone(); - for ( byte base : BaseUtils.BASES ) { - double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore); + for ( byte trueBase : BaseUtils.BASES ) { + double likelihood = 0.0; - if ( VERBOSE ) { - boolean fwdStrand = ! 
read.getReadNegativeStrandFlag(); - System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n", - observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood); + for ( byte fragmentBase : BaseUtils.BASES ) { + double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3); + if ( qualityScore1 != 0 ) { + log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1); + } + if ( qualityScore2 != 0 ) { + log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2); + } + + //if ( VERBOSE ) { + // System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n", + // observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood); + //} + + likelihood += pow(10, log10FragmentLikelihood); } - log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood; + log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood); } return log10FourBaseLikelihoods; @@ -444,19 +484,16 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { // // ----------------------------------------------------------------------------------------------------------------- - public static int strandIndex(boolean fwdStrand) { - return fwdStrand ? 0 : 1; - } - /** * Returns true when the observedBase is considered usable. - * @param p pileup element - * @param ignoreBadBases should we ignore bad bases? + * @param p pileup element + * @param ignoreBadBases should we ignore bad bases? 
* @return true if the base is a usable base */ protected static boolean usableBase(PileupElement p, boolean ignoreBadBases) { - // ignore deletions and filtered bases + // ignore deletions, Q0 bases, and filtered bases if ( p.isDeletion() || + p.getQual() == 0 || (p.getRead() instanceof GATKSAMRecord && !((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset())) ) return false; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerFragmentPileupElement.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerFragmentPileupElement.java new file mode 100755 index 000000000..c15ba2016 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerFragmentPileupElement.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Jan 10, 2011 + * + * Useful helper class to represent a full read pair at a position + */ +public class PerFragmentPileupElement implements Iterable { + protected Set PEs = null; + + public PerFragmentPileupElement(Set PEs) { + this.PEs = new HashSet(PEs); + } + + public Set getPileupElements() { + return PEs; + } + + public Iterator iterator() { + return PEs.iterator(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 596a6be4c..329769ac7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -87,7 +87,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup(); // create the GenotypeLikelihoods object - DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors); + DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); int nGoodBases = GL.add(pileup, true, true); if ( nGoodBases == 0 ) continue; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 89f7d3377..4efc0b8a9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ 
b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -41,6 +41,9 @@ public class UnifiedArgumentCollection { @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; + @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) + public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; + // control the output @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public boolean GENOTYPE_MODE = false; @@ -86,6 +89,8 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; + + // indel-related arguments @Argument(fullName = "get_indel_alleles_from_vcf", shortName = "getIndelAllelesFromVCF", doc = "Get reference/alt alleles for indel genotyping from VCF", required = false) public boolean GET_ALLELES_FROM_VCF = false; @@ -100,6 +105,7 @@ public class UnifiedArgumentCollection { uac.GLmodel = GLmodel; uac.heterozygosity = heterozygosity; + uac.PCR_error = PCR_error; uac.GENOTYPE_MODE = GENOTYPE_MODE; uac.ALL_BASES_MODE = ALL_BASES_MODE; uac.NO_SLOD = NO_SLOD; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java index da0e23759..61fd37f04 100644 --- 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java @@ -36,8 +36,10 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.utils.pileup.PileupElement; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.io.PrintStream; @@ -104,9 +106,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker reads = context.getReads(); - if(reads.size() > 0 ) { - List offsets = context.getOffsets(); + if(context.size() > 0 ) { int numAs = 0, numCs = 0, numGs = 0, numTs = 0; //if (DEBUG){ @@ -115,19 +115,16 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker fragment = new HashSet(); + fragment.add(p); + G.add(new PerFragmentPileupElement(fragment), true, false); //if (DEBUG){ if (base == 'A'){numAs++;} else if (base == 'C'){numCs++;} @@ -146,7 +143,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker reduce(Integer value, Pair sum) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java index 791cf0d7d..43bc42e38 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.PileupElement; import 
org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.commandline.Argument; @@ -41,6 +42,7 @@ import org.broadinstitute.sting.commandline.Output; import java.io.*; import java.util.ArrayList; +import java.util.HashSet; import java.util.Hashtable; import java.util.List; @@ -340,7 +342,7 @@ public class CallHLAWalker extends LocusWalker>{ ReadBackedPileup pileup = context.getPileup(); long loc = context.getPosition(); - if( context.getReads().size() > 0 ) { + if( context.size() > 0 ) { //out.printf("RG for first read: %s%n",context.getReads().get(0).getReadName()); int numAs = 0, numCs = 0, numGs = 0, numTs = 0,depth = 0; String c1 = "", c2 = ""; @@ -354,25 +356,23 @@ public class CallHLAWalker extends LocusWalker>{ //Calculate posterior probabilities DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(); - SAMRecord read; int offset; char base; byte qual; int mapquality; String readname; - //Check for bad bases and ensure mapping quality myself. This works. 
- for (int i = 0; i < reads.size(); i++) { - read = reads.get(i); - offset = offsets.get(i); - base = (char)read.getReadBases()[offset]; - qual = read.getBaseQualities()[offset]; - mapquality = read.getMappingQuality(); + for ( PileupElement p : context.getBasePileup() ) { + byte base = p.getBase(); + int mapquality = p.getMappingQual(); if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) { - if (ReadsToFilter.contains(read.getReadName())){ + String readname = p.getRead().getReadName(); + if (ReadsToFilter.contains(readname)){ if (DEBUG){ - out.printf("\n%s %s %s %s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),base); + out.printf("\n%s %s %s %s\n",readname,p.getRead().getAlignmentStart(),p.getRead().getAlignmentEnd(),base); } }else{ //consider base in likelihood calculations if it looks good and has high mapping score - G.add((byte)base, qual, read, offset); - readname = read.getReadName(); - if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);} + // TODO-- move this outside, e.g. 
to ReadBackedPileup + HashSet fragment = new HashSet(); + fragment.add(p); + G.add(new PerFragmentPileupElement(fragment), true, false); + if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(p.getRead());} if (base == 'A'){numAs++; depth++;} else if (base == 'C'){numCs++; depth++;} else if (base == 'T'){numTs++; depth++;} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 54fec9cb0..3fe089ee5 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -25,7 +25,7 @@ public class public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("6c04cc01cf6bebe4bdbc025fd45c5559")); + Arrays.asList("e7514b0f1f2df1ca42815b5c45775f36")); executeTest("testMultiSamplePilot1", spec); } @@ -33,7 +33,7 @@ public class public void testMultiSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList("84efe068164891dbec7c85ff6cc33df3")); + Arrays.asList("49ae129435063f47f0faf00337eb8bf7")); executeTest("testMultiSamplePilot2", spec); } @@ -41,7 +41,7 @@ public class public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("c7decebadb35067a38b9be3c43c1fb76")); + Arrays.asList("8da08fe12bc0d95e548fe63681997038")); 
executeTest("testSingleSamplePilot2", spec); } @@ -51,7 +51,7 @@ public class // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "66bcb3f7b20d4fbe7849b616bfe69c0f"; + private final static String COMPRESSED_OUTPUT_MD5 = "950ebf5eaa245131dc22aef211526681"; @Test public void testCompressedOutput() { @@ -78,7 +78,7 @@ public class @Test public void testParallelization() { - String md5 = "a0032a9952c94a55e0fa42e2dec33169"; + String md5 = "73845a29994d653a4d3f01b5467a3eed"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -105,12 +105,12 @@ public class @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "4a623d4a68fba4f9f8d7e915af0cc450" ); - e.put( "-all_bases", "2ab5106795c70a63d8dd2353ee0f1427" ); - e.put( "--min_base_quality_score 26", "9c9be22923b9ba0d588f3d37385ae4b0" ); - e.put( "--min_mapping_quality_score 26", "aa36bb2f4fc5bd6b6c4206e0a084a577" ); - e.put( "--max_mismatches_in_40bp_window 5", "4914ed25a8ca22c7aea983999cd6768a" ); - e.put( "--p_nonref_model GRID_SEARCH", "dd4fb1fb304e44b82c9cc3d4cc257172" ); + e.put( "-genotype", "683ce57f2fd3acd5f6fe7599c1ace169" ); + e.put( "-all_bases", "2ddf763c208602693cad942c9ccb804c" ); + e.put( "--min_base_quality_score 26", "5f1cfb9c7f82e6414d5db7aa344813ac" ); + e.put( "--min_mapping_quality_score 26", "6c3ad441f3a23ade292549b1dea80932" ); + e.put( "--max_mismatches_in_40bp_window 5", "5ecaf4281410b67e8e2e164f2ea0d58a" ); + e.put( "--p_nonref_model GRID_SEARCH", "17ffb56d078fdde335a79773e9534ce7" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -124,12 +124,12 @@ public class public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new 
WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("dd4fb1fb304e44b82c9cc3d4cc257172")); + Arrays.asList("17ffb56d078fdde335a79773e9534ce7")); executeTest("testConfidence1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("eef4e314d0cdae669454232ffbbab8ea")); + Arrays.asList("d49ec8c1476cecb8e3153894cc0f6662")); executeTest("testConfidence2", spec2); } @@ -141,8 +141,8 @@ public class @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "1bbd2e24ddec902339eac481565d7f0a" ); - e.put( 1.0 / 1850, "472d2d6c375a7c6d2edb464a12f10742" ); + e.put( 0.01, "7e820c33a4c688148eec57342645c9b6" ); + e.put( 1.0 / 1850, "c1c48e0c4724b75f12936e22a8629457" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -165,7 +165,7 @@ public class " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("eb4144006eda780d69b6979817ceb58d")); + Arrays.asList("5974d8c21d27d014e2d0bed695b0b42e")); executeTest(String.format("testMultiTechnologies"), spec); } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerIntegrationTest.java index 9cacba212..5b54d110c 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerIntegrationTest.java @@ -47,7 +47,7 @@ public class HLACallerIntegrationTest extends WalkerTest { public void testCalculateBaseLikelihoods() { WalkerTestSpec spec = new WalkerTestSpec( "-T 
CalculateBaseLikelihoods -I " + validationDataLocation + "NA12878.HISEQ.HLA.bam -R " + b36KGReference + " -L " + intervals + " -filter " + validationDataLocation + "HLA_HISEQ.filter -maxAllowedMismatches 6 -minRequiredMatches 0 -o %s", 1, - Arrays.asList("98e64882f93bf7550457bee4182caab6")); + Arrays.asList("921bb354f3877e5183ca31815546b9fd")); executeTest("test CalculateBaseLikelihoods", spec); }