Committing the fragment-based calling code. Results look great in all datasets (will show this at 1000G this week with Ryan). Note that this is an intermediate commit. The code needs to be cleaned up and the fragmentation code needs to be moved up into LocusIteratorByState. This should all happen later this week, but I don't want Ryan to have to keep running from my own personal Sting directory. The current crappy implementation adds ~10% to the runtime, but that should all go away in the next iteration.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5058 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bb6999b032
commit
2bbcc9275a
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -34,6 +33,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
|
|
@ -69,6 +72,8 @@ import static java.lang.Math.pow;
|
|||
* model.
|
||||
*/
|
||||
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
||||
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
|
||||
|
||||
protected final static int FIXED_PLOIDY = 2;
|
||||
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
|
||||
protected final static double ploidyAdjustment = log10(FIXED_PLOIDY);
|
||||
|
|
@ -84,22 +89,31 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
|
||||
protected DiploidSNPGenotypePriors priors = null;
|
||||
|
||||
// TODO: don't calculate this each time through
|
||||
protected double log10_PCR_error_3;
|
||||
protected double log10_1_minus_PCR_error;
|
||||
|
||||
/**
 * Create a new GenotypeLikelihoods object with flat priors for each diploid genotype
 * and the default PCR error rate ({@link #DEFAULT_PCR_ERROR_RATE}).
 */
public DiploidSNPGenotypeLikelihoods() {
    this.priors = new DiploidSNPGenotypePriors();
    // cache log10(e/3) and log10(1-e) for the default PCR error rate e,
    // so per-base likelihood updates don't recompute them
    log10_PCR_error_3 = log10(DEFAULT_PCR_ERROR_RATE) - log10_3;
    log10_1_minus_PCR_error = log10(1.0 - DEFAULT_PCR_ERROR_RATE);
    setToZero();
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelihoods object with given priors for each diploid genotype
|
||||
* Create a new GenotypeLikelihoods object with given priors and PCR error rate for each diploid genotype
|
||||
*
|
||||
* @param priors priors
|
||||
* @param priors priors
|
||||
* @param PCR_error_rate the PCR error rate
|
||||
*/
|
||||
public DiploidSNPGenotypeLikelihoods(DiploidSNPGenotypePriors priors) {
|
||||
public DiploidSNPGenotypeLikelihoods(DiploidSNPGenotypePriors priors, double PCR_error_rate) {
|
||||
this.priors = priors;
|
||||
log10_PCR_error_3 = log10(PCR_error_rate) - log10_3;
|
||||
log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate);
|
||||
setToZero();
|
||||
}
|
||||
|
||||
|
|
@ -242,45 +256,62 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
int n = 0;
|
||||
|
||||
// todo: first validate that my changes were good by passing in fragments representing just a single read...
|
||||
//Set<PerFragmentPileupElement> fragments = new HashSet<PerFragmentPileupElement>();
|
||||
// TODO-- move this outside the UG, e.g. to ReadBackedPileup
|
||||
// combine paired reads into a single fragment
|
||||
HashMap<String, PerFragmentPileupElement> fragments = new HashMap<String, PerFragmentPileupElement>();
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( usableBase(p, ignoreBadBases) ) {
|
||||
Set<PileupElement> fragment = new HashSet<PileupElement>();
|
||||
String readName = p.getRead().getReadName();
|
||||
|
||||
/****
|
||||
Set<PileupElement> fragment = new HashSet<PileupElement>();
|
||||
fragment.add(p);
|
||||
fragments.add(new PerFragmentPileupElement(fragment));
|
||||
}
|
||||
if ( fragments.containsKey(readName) )
|
||||
fragment.addAll(fragments.get(readName).getPileupElements());
|
||||
|
||||
fragment.add(p);
|
||||
fragments.put(readName, new PerFragmentPileupElement(fragment));
|
||||
}
|
||||
|
||||
// for each fragment, add to the likelihoods
|
||||
for ( PerFragmentPileupElement fragment : fragments ) {
|
||||
n += add(fragment, capBaseQualsAtMappingQual);
|
||||
}
|
||||
****/
|
||||
|
||||
byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
|
||||
n += add(p.getBase(), qual, p.getRead(), p.getOffset());
|
||||
}
|
||||
for ( PerFragmentPileupElement fragment : fragments.values() ) {
|
||||
n += add(fragment, ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
// public int add(PerFragmentPileupElement fragment, boolean capBaseQualsAtMappingQual) {
|
||||
public int add(PerFragmentPileupElement fragment, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
// TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
|
||||
// TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
|
||||
// TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.
|
||||
byte observedBase1 = 0, observedBase2 = 0, qualityScore1 = 0, qualityScore2 = 0;
|
||||
for ( PileupElement p : fragment ) {
|
||||
if ( !usableBase(p, ignoreBadBases) )
|
||||
continue;
|
||||
|
||||
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
if ( qualityScore == 0 ) { // zero quals are wrong
|
||||
return 0;
|
||||
byte qual = p.getQual();
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBam(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||
|
||||
if ( qualityScore1 == 0 ) {
|
||||
observedBase1 = p.getBase();
|
||||
qualityScore1 = qual;
|
||||
} else {
|
||||
observedBase2 = p.getBase();
|
||||
qualityScore2 = qual;
|
||||
}
|
||||
}
|
||||
|
||||
// abort early if we didn't see any good bases
|
||||
if ( observedBase1 == 0 && observedBase2 == 0 )
|
||||
return 0;
|
||||
|
||||
// Just look up the cached result if it's available, or compute and store it
|
||||
DiploidSNPGenotypeLikelihoods gl;
|
||||
if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) {
|
||||
gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset);
|
||||
if ( ! inCache(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY) ) {
|
||||
gl = calculateCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
|
||||
} else {
|
||||
gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read);
|
||||
gl = getCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
|
||||
}
|
||||
|
||||
// for bad bases, there are no likelihoods
|
||||
|
|
@ -292,11 +323,10 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double likelihood = likelihoods[g.ordinal()];
|
||||
|
||||
if ( VERBOSE ) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||
}
|
||||
//if ( VERBOSE ) {
|
||||
// System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
// observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood);
|
||||
//}
|
||||
|
||||
log10Likelihoods[g.ordinal()] += likelihood;
|
||||
log10Posteriors[g.ordinal()] += likelihood;
|
||||
|
|
@ -305,52 +335,50 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
return 1;
|
||||
}
|
||||
|
||||
static DiploidSNPGenotypeLikelihoods[][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2];
|
||||
static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY];
|
||||
|
||||
protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null;
|
||||
protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
|
||||
return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) != null;
|
||||
}
|
||||
|
||||
protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read);
|
||||
protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
|
||||
DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy);
|
||||
if ( gl == null )
|
||||
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||
observedBase, qualityScore, ploidy, read));
|
||||
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d",
|
||||
observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy));
|
||||
return gl;
|
||||
}
|
||||
|
||||
protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) {
|
||||
DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
|
||||
setCache(CACHE, observedBase, qualityScore, ploidy, read, gl);
|
||||
protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
|
||||
DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
|
||||
setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl);
|
||||
return gl;
|
||||
}
|
||||
|
||||
protected void setCache( DiploidSNPGenotypeLikelihoods[][][][] cache,
|
||||
byte observedBase, byte qualityScore, int ploidy,
|
||||
SAMRecord read, DiploidSNPGenotypeLikelihoods val ) {
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
if ( j > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBam(read, String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, j, read.getReadName()));
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||
protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][] cache,
|
||||
byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy,
|
||||
DiploidSNPGenotypeLikelihoods val ) {
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
|
||||
int j = qualityScore1;
|
||||
int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
|
||||
int l = qualityScore2;
|
||||
int m = ploidy;
|
||||
|
||||
cache[i][j][k][x] = val;
|
||||
cache[i][j][k][l][m] = val;
|
||||
}
|
||||
|
||||
protected DiploidSNPGenotypeLikelihoods getCache( DiploidSNPGenotypeLikelihoods[][][][] cache,
|
||||
byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
if ( j > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBam(read, String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, j, read.getReadName()));
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||
return cache[i][j][k][x];
|
||||
protected DiploidSNPGenotypeLikelihoods getCache(DiploidSNPGenotypeLikelihoods[][][][][] cache,
|
||||
byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
|
||||
int j = qualityScore1;
|
||||
int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
|
||||
int l = qualityScore2;
|
||||
int m = ploidy;
|
||||
return cache[i][j][k][l][m];
|
||||
}
|
||||
|
||||
protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase, qualityScore, read);
|
||||
protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
|
||||
double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
|
||||
|
||||
try {
|
||||
|
||||
|
|
@ -388,24 +416,36 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||
* qualityScore.
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param qualityScore base quality
|
||||
* @param read SAM read
|
||||
* @param observedBase1 the base observed on the 1st read of the fragment
|
||||
* @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA
|
||||
* @param observedBase2 the base observed on the 2nd read of the fragment
|
||||
* @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA
|
||||
* @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
|
||||
*/
|
||||
protected double[] computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read) {
|
||||
protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
|
||||
double[] log10FourBaseLikelihoods = baseZeros.clone();
|
||||
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore);
|
||||
for ( byte trueBase : BaseUtils.BASES ) {
|
||||
double likelihood = 0.0;
|
||||
|
||||
if ( VERBOSE ) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||
for ( byte fragmentBase : BaseUtils.BASES ) {
|
||||
double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3);
|
||||
if ( qualityScore1 != 0 ) {
|
||||
log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1);
|
||||
}
|
||||
if ( qualityScore2 != 0 ) {
|
||||
log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2);
|
||||
}
|
||||
|
||||
//if ( VERBOSE ) {
|
||||
// System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n",
|
||||
// observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood);
|
||||
//}
|
||||
|
||||
likelihood += pow(10, log10FragmentLikelihood);
|
||||
}
|
||||
|
||||
log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood;
|
||||
log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood);
|
||||
}
|
||||
|
||||
return log10FourBaseLikelihoods;
|
||||
|
|
@ -444,19 +484,16 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public static int strandIndex(boolean fwdStrand) {
|
||||
return fwdStrand ? 0 : 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the observedBase is considered usable.
|
||||
* @param p pileup element
|
||||
* @param ignoreBadBases should we ignore bad bases?
|
||||
* @param p pileup element
|
||||
* @param ignoreBadBases should we ignore bad bases?
|
||||
* @return true if the base is a usable base
|
||||
*/
|
||||
protected static boolean usableBase(PileupElement p, boolean ignoreBadBases) {
|
||||
// ignore deletions and filtered bases
|
||||
// ignore deletions, Q0 bases, and filtered bases
|
||||
if ( p.isDeletion() ||
|
||||
p.getQual() == 0 ||
|
||||
(p.getRead() instanceof GATKSAMRecord &&
|
||||
!((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset())) )
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Jan 10, 2011
|
||||
*
|
||||
* Useful helper class to represent a full read pair at a position
|
||||
*/
|
||||
public class PerFragmentPileupElement implements Iterable<PileupElement> {
|
||||
protected Set<PileupElement> PEs = null;
|
||||
|
||||
public PerFragmentPileupElement(Set<PileupElement> PEs) {
|
||||
this.PEs = new HashSet<PileupElement>(PEs);
|
||||
}
|
||||
|
||||
public Set<PileupElement> getPileupElements() {
|
||||
return PEs;
|
||||
}
|
||||
|
||||
public Iterator<PileupElement> iterator() {
|
||||
return PEs.iterator();
|
||||
}
|
||||
}
|
||||
|
|
@ -87,7 +87,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup();
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors);
|
||||
DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
|
||||
int nGoodBases = GL.add(pileup, true, true);
|
||||
if ( nGoodBases == 0 )
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -41,6 +41,9 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
@Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
|
||||
public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
|
||||
|
||||
// control the output
|
||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
|
||||
public boolean GENOTYPE_MODE = false;
|
||||
|
|
@ -86,6 +89,8 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
|
||||
// indel-related arguments
|
||||
@Argument(fullName = "get_indel_alleles_from_vcf", shortName = "getIndelAllelesFromVCF", doc = "Get reference/alt alleles for indel genotyping from VCF", required = false)
|
||||
public boolean GET_ALLELES_FROM_VCF = false;
|
||||
|
||||
|
|
@ -100,6 +105,7 @@ public class UnifiedArgumentCollection {
|
|||
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.PCR_error = PCR_error;
|
||||
uac.GENOTYPE_MODE = GENOTYPE_MODE;
|
||||
uac.ALL_BASES_MODE = ALL_BASES_MODE;
|
||||
uac.NO_SLOD = NO_SLOD;
|
||||
|
|
|
|||
|
|
@ -36,8 +36,10 @@ import org.broadinstitute.sting.utils.collections.Pair;
|
|||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.io.PrintStream;
|
||||
|
||||
|
|
@ -104,9 +106,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
if(reads.size() > 0 ) {
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
if(context.size() > 0 ) {
|
||||
|
||||
int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
|
||||
//if (DEBUG){
|
||||
|
|
@ -115,19 +115,16 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
|
||||
//Calculate posterior probabilities
|
||||
DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods();
|
||||
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
|
||||
|
||||
//Check for bad bases and ensure mapping quality
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
read = reads.get(i);
|
||||
offset = offsets.get(i);
|
||||
base = (char)read.getReadBases()[offset];
|
||||
qual = read.getBaseQualities()[offset];
|
||||
//mapquality = read.getMappingQuality();
|
||||
if (!ReadsToDiscard.contains(read.getReadName()) && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
|
||||
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add((byte)base, qual, read, offset);
|
||||
|
||||
for ( PileupElement p : context.getBasePileup() ) {
|
||||
byte base = p.getBase();
|
||||
if (!ReadsToDiscard.contains(p.getRead().getReadName()) && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
|
||||
|
||||
// TODO-- move this outside, e.g. to ReadBackedPileup
|
||||
HashSet<PileupElement> fragment = new HashSet<PileupElement>();
|
||||
fragment.add(p);
|
||||
G.add(new PerFragmentPileupElement(fragment), true, false);
|
||||
//if (DEBUG){
|
||||
if (base == 'A'){numAs++;}
|
||||
else if (base == 'C'){numCs++;}
|
||||
|
|
@ -146,7 +143,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
//}
|
||||
|
||||
}
|
||||
return context.getReads().size();
|
||||
return context.size();
|
||||
}
|
||||
|
||||
public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum) {
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
|
@ -41,6 +42,7 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -340,7 +342,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
ReadBackedPileup pileup = context.getPileup();
|
||||
|
||||
long loc = context.getPosition();
|
||||
if( context.getReads().size() > 0 ) {
|
||||
if( context.size() > 0 ) {
|
||||
//out.printf("RG for first read: %s%n",context.getReads().get(0).getReadName());
|
||||
int numAs = 0, numCs = 0, numGs = 0, numTs = 0,depth = 0;
|
||||
String c1 = "", c2 = "";
|
||||
|
|
@ -354,25 +356,23 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
|
||||
//Calculate posterior probabilities
|
||||
DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods();
|
||||
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
|
||||
|
||||
//Check for bad bases and ensure mapping quality myself. This works.
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
read = reads.get(i);
|
||||
offset = offsets.get(i);
|
||||
base = (char)read.getReadBases()[offset];
|
||||
qual = read.getBaseQualities()[offset];
|
||||
mapquality = read.getMappingQuality();
|
||||
for ( PileupElement p : context.getBasePileup() ) {
|
||||
byte base = p.getBase();
|
||||
int mapquality = p.getMappingQual();
|
||||
if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
|
||||
if (ReadsToFilter.contains(read.getReadName())){
|
||||
String readname = p.getRead().getReadName();
|
||||
if (ReadsToFilter.contains(readname)){
|
||||
if (DEBUG){
|
||||
out.printf("\n%s %s %s %s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),base);
|
||||
out.printf("\n%s %s %s %s\n",readname,p.getRead().getAlignmentStart(),p.getRead().getAlignmentEnd(),base);
|
||||
}
|
||||
}else{
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add((byte)base, qual, read, offset);
|
||||
readname = read.getReadName();
|
||||
if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);}
|
||||
// TODO-- move this outside, e.g. to ReadBackedPileup
|
||||
HashSet<PileupElement> fragment = new HashSet<PileupElement>();
|
||||
fragment.add(p);
|
||||
G.add(new PerFragmentPileupElement(fragment), true, false);
|
||||
if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(p.getRead());}
|
||||
if (base == 'A'){numAs++; depth++;}
|
||||
else if (base == 'C'){numCs++; depth++;}
|
||||
else if (base == 'T'){numTs++; depth++;}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ public class
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("6c04cc01cf6bebe4bdbc025fd45c5559"));
|
||||
Arrays.asList("e7514b0f1f2df1ca42815b5c45775f36"));
|
||||
executeTest("testMultiSamplePilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -33,7 +33,7 @@ public class
|
|||
public void testMultiSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1,
|
||||
Arrays.asList("84efe068164891dbec7c85ff6cc33df3"));
|
||||
Arrays.asList("49ae129435063f47f0faf00337eb8bf7"));
|
||||
executeTest("testMultiSamplePilot2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -41,7 +41,7 @@ public class
|
|||
public void testSingleSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("c7decebadb35067a38b9be3c43c1fb76"));
|
||||
Arrays.asList("8da08fe12bc0d95e548fe63681997038"));
|
||||
executeTest("testSingleSamplePilot2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -51,7 +51,7 @@ public class
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "66bcb3f7b20d4fbe7849b616bfe69c0f";
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "950ebf5eaa245131dc22aef211526681";
|
||||
|
||||
@Test
|
||||
public void testCompressedOutput() {
|
||||
|
|
@ -78,7 +78,7 @@ public class
|
|||
|
||||
@Test
|
||||
public void testParallelization() {
|
||||
String md5 = "a0032a9952c94a55e0fa42e2dec33169";
|
||||
String md5 = "73845a29994d653a4d3f01b5467a3eed";
|
||||
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
|
||||
|
|
@ -105,12 +105,12 @@ public class
|
|||
@Test
|
||||
public void testParameter() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "-genotype", "4a623d4a68fba4f9f8d7e915af0cc450" );
|
||||
e.put( "-all_bases", "2ab5106795c70a63d8dd2353ee0f1427" );
|
||||
e.put( "--min_base_quality_score 26", "9c9be22923b9ba0d588f3d37385ae4b0" );
|
||||
e.put( "--min_mapping_quality_score 26", "aa36bb2f4fc5bd6b6c4206e0a084a577" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "4914ed25a8ca22c7aea983999cd6768a" );
|
||||
e.put( "--p_nonref_model GRID_SEARCH", "dd4fb1fb304e44b82c9cc3d4cc257172" );
|
||||
e.put( "-genotype", "683ce57f2fd3acd5f6fe7599c1ace169" );
|
||||
e.put( "-all_bases", "2ddf763c208602693cad942c9ccb804c" );
|
||||
e.put( "--min_base_quality_score 26", "5f1cfb9c7f82e6414d5db7aa344813ac" );
|
||||
e.put( "--min_mapping_quality_score 26", "6c3ad441f3a23ade292549b1dea80932" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "5ecaf4281410b67e8e2e164f2ea0d58a" );
|
||||
e.put( "--p_nonref_model GRID_SEARCH", "17ffb56d078fdde335a79773e9534ce7" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -124,12 +124,12 @@ public class
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
|
||||
Arrays.asList("dd4fb1fb304e44b82c9cc3d4cc257172"));
|
||||
Arrays.asList("17ffb56d078fdde335a79773e9534ce7"));
|
||||
executeTest("testConfidence1", spec1);
|
||||
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
|
||||
Arrays.asList("eef4e314d0cdae669454232ffbbab8ea"));
|
||||
Arrays.asList("d49ec8c1476cecb8e3153894cc0f6662"));
|
||||
executeTest("testConfidence2", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -141,8 +141,8 @@ public class
|
|||
@Test
|
||||
public void testHeterozyosity() {
|
||||
HashMap<Double, String> e = new HashMap<Double, String>();
|
||||
e.put( 0.01, "1bbd2e24ddec902339eac481565d7f0a" );
|
||||
e.put( 1.0 / 1850, "472d2d6c375a7c6d2edb464a12f10742" );
|
||||
e.put( 0.01, "7e820c33a4c688148eec57342645c9b6" );
|
||||
e.put( 1.0 / 1850, "c1c48e0c4724b75f12936e22a8629457" );
|
||||
|
||||
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -165,7 +165,7 @@ public class
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("eb4144006eda780d69b6979817ceb58d"));
|
||||
Arrays.asList("5974d8c21d27d014e2d0bed695b0b42e"));
|
||||
|
||||
executeTest(String.format("testMultiTechnologies"), spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ public class HLACallerIntegrationTest extends WalkerTest {
|
|||
public void testCalculateBaseLikelihoods() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T CalculateBaseLikelihoods -I " + validationDataLocation + "NA12878.HISEQ.HLA.bam -R " + b36KGReference + " -L " + intervals + " -filter " + validationDataLocation + "HLA_HISEQ.filter -maxAllowedMismatches 6 -minRequiredMatches 0 -o %s", 1,
|
||||
Arrays.asList("98e64882f93bf7550457bee4182caab6"));
|
||||
Arrays.asList("921bb354f3877e5183ca31815546b9fd"));
|
||||
executeTest("test CalculateBaseLikelihoods", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue