Intermediate checking before code reorganization. Full blown support for empirical transition probs in SSG for all platforms. Support for defaultPlatform arg in SSG. Renaming classes for final cleanup
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1479 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6ab9ddf9f5
commit
5af4bb628b
|
|
@ -4,6 +4,11 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
|||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
import java.util.TreeMap;
|
||||
import java.util.EnumMap;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -13,25 +18,208 @@ import static java.lang.Math.pow;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotypeLikelihoods {
|
||||
private final static double[][] misCallProbs = new double[4][4];
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to manipulate machine platforms
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public enum SequencerPlatform {
|
||||
SOLEXA, // Solexa / Illumina
|
||||
ROCHE454, // 454
|
||||
SOLID, // SOLiD
|
||||
UNKNOWN // No idea -- defaulting to 1/3
|
||||
}
|
||||
|
||||
private static void addMisCall(char miscalledBase, char trueBase, double p) {
|
||||
private final static String SAM_PLATFORM_TAG = "PL";
|
||||
|
||||
private static TreeMap<String, SequencerPlatform> PLFieldToSequencerPlatform = new TreeMap<String, SequencerPlatform>();
|
||||
private static void bind(String s, SequencerPlatform x) {
|
||||
PLFieldToSequencerPlatform.put(s, x);
|
||||
PLFieldToSequencerPlatform.put(s.toUpperCase(), x);
|
||||
PLFieldToSequencerPlatform.put(s.toLowerCase(), x);
|
||||
}
|
||||
|
||||
//
|
||||
// Static list of platforms supported by this system.
|
||||
//
|
||||
static {
|
||||
bind("LS454", SequencerPlatform.ROCHE454);
|
||||
bind("454", SequencerPlatform.ROCHE454);
|
||||
bind("ILLUMINA", SequencerPlatform.SOLEXA);
|
||||
bind("solid", SequencerPlatform.SOLID);
|
||||
}
|
||||
|
||||
public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) {
|
||||
if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(sequencerString) ) {
|
||||
return PLFieldToSequencerPlatform.get(sequencerString);
|
||||
} else {
|
||||
return SequencerPlatform.UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to get at the transition tables themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A matrix of value i x j -> log10(p) where
|
||||
*
|
||||
* i - char of the miscalled base (i.e., A)
|
||||
* j - char of the presumed true base (i.e., C)
|
||||
* log10p - empirical probability p that A is actually C
|
||||
*
|
||||
* The table is available for each technology
|
||||
*/
|
||||
private final static EnumMap<SequencerPlatform, double[][]> log10pTrueGivenMiscall = new EnumMap<SequencerPlatform, double[][]>(SequencerPlatform.class);
|
||||
|
||||
private static void addMisCall(final SequencerPlatform pl, char miscalledBase, char trueBase, double p) {
|
||||
if ( ! log10pTrueGivenMiscall.containsKey(pl) )
|
||||
log10pTrueGivenMiscall.put(pl, new double[4][4]);
|
||||
|
||||
double[][] misCallProbs = log10pTrueGivenMiscall.get(pl);
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
|
||||
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
|
||||
misCallProbs[i][j] = log10(p);
|
||||
}
|
||||
|
||||
private static double getProbMiscallIsBase(char miscalledBase, char trueBase) {
|
||||
private static double getProbMiscallIsBase(SequencerPlatform pl, char miscalledBase, char trueBase) {
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
|
||||
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
|
||||
|
||||
double logP = misCallProbs[i][j];
|
||||
double logP = log10pTrueGivenMiscall.get(pl)[i][j];
|
||||
if ( logP == 0.0 )
|
||||
throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%b", miscalledBase, trueBase));
|
||||
else
|
||||
return logP;
|
||||
}
|
||||
|
||||
private static void addSolexa() {
|
||||
SequencerPlatform pl = SequencerPlatform.SOLEXA;
|
||||
addMisCall(pl, 'A', 'C', 57.7/100.0);
|
||||
addMisCall(pl, 'A', 'G', 17.1/100.0);
|
||||
addMisCall(pl, 'A', 'T', 25.2/100.0);
|
||||
|
||||
addMisCall(pl, 'C', 'A', 34.9/100.0);
|
||||
addMisCall(pl, 'C', 'G', 11.3/100.0);
|
||||
addMisCall(pl, 'C', 'T', 53.9/100.0);
|
||||
|
||||
addMisCall(pl, 'G', 'A', 31.9/100.0);
|
||||
addMisCall(pl, 'G', 'C', 5.1/100.0);
|
||||
addMisCall(pl, 'G', 'T', 63.0/100.0);
|
||||
|
||||
addMisCall(pl, 'T', 'A', 45.8/100.0);
|
||||
addMisCall(pl, 'T', 'C', 22.1/100.0);
|
||||
addMisCall(pl, 'T', 'G', 32.0/100.0);
|
||||
}
|
||||
//
|
||||
// // TODO -- delete me for testing only
|
||||
// private static void addSolexa() {
|
||||
// SequencerPlatform pl = SequencerPlatform.SOLEXA;
|
||||
// addMisCall(pl, 'A', 'C', 59.2/100.0);
|
||||
// addMisCall(pl, 'A', 'G', 15.3/100.0);
|
||||
// addMisCall(pl, 'A', 'T', 25.6/100.0);
|
||||
//
|
||||
// addMisCall(pl, 'C', 'A', 54.2/100.0);
|
||||
// addMisCall(pl, 'C', 'G', 10.3/100.0);
|
||||
// addMisCall(pl, 'C', 'T', 35.5/100.0);
|
||||
//
|
||||
// addMisCall(pl, 'G', 'A', 26.4/100.0);
|
||||
// addMisCall(pl, 'G', 'C', 5.6/100.0);
|
||||
// addMisCall(pl, 'G', 'T', 68.0/100.0);
|
||||
//
|
||||
// addMisCall(pl, 'T', 'A', 41.8/100.0);
|
||||
// addMisCall(pl, 'T', 'C', 17.3/100.0);
|
||||
// addMisCall(pl, 'T', 'G', 40.9/100.0);
|
||||
//
|
||||
// }
|
||||
|
||||
private static void addSOLiD() {
|
||||
SequencerPlatform pl = SequencerPlatform.SOLID;
|
||||
addMisCall(pl, 'A', 'C', 18.7/100.0);
|
||||
addMisCall(pl, 'A', 'G', 42.5/100.0);
|
||||
addMisCall(pl, 'A', 'T', 38.7/100.0);
|
||||
|
||||
addMisCall(pl, 'C', 'A', 27.0/100.0);
|
||||
addMisCall(pl, 'C', 'G', 18.9/100.0);
|
||||
addMisCall(pl, 'C', 'T', 54.1/100.0);
|
||||
|
||||
addMisCall(pl, 'G', 'A', 61.0/100.0);
|
||||
addMisCall(pl, 'G', 'C', 15.7/100.0);
|
||||
addMisCall(pl, 'G', 'T', 23.2/100.0);
|
||||
|
||||
addMisCall(pl, 'T', 'A', 40.5/100.0);
|
||||
addMisCall(pl, 'T', 'C', 34.3/100.0);
|
||||
addMisCall(pl, 'T', 'G', 25.2/100.0);
|
||||
}
|
||||
|
||||
private static void add454() {
|
||||
SequencerPlatform pl = SequencerPlatform.ROCHE454;
|
||||
addMisCall(pl, 'A', 'C', 23.2/100.0);
|
||||
addMisCall(pl, 'A', 'G', 42.6/100.0);
|
||||
addMisCall(pl, 'A', 'T', 34.3/100.0);
|
||||
|
||||
addMisCall(pl, 'C', 'A', 19.7/100.0);
|
||||
addMisCall(pl, 'C', 'G', 8.4/100.0);
|
||||
addMisCall(pl, 'C', 'T', 71.9/100.0);
|
||||
|
||||
addMisCall(pl, 'G', 'A', 71.5/100.0);
|
||||
addMisCall(pl, 'G', 'C', 6.6/100.0);
|
||||
addMisCall(pl, 'G', 'T', 21.9/100.0);
|
||||
|
||||
addMisCall(pl, 'T', 'A', 43.8/100.0);
|
||||
addMisCall(pl, 'T', 'C', 37.8/100.0);
|
||||
addMisCall(pl, 'T', 'G', 18.5/100.0);
|
||||
}
|
||||
|
||||
private static void addUnknown() {
|
||||
SequencerPlatform pl = SequencerPlatform.UNKNOWN;
|
||||
for ( char b1 : BaseUtils.BASES ) {
|
||||
for ( char b2 : BaseUtils.BASES ) {
|
||||
if ( b1 != b2 )
|
||||
addMisCall(pl, b1, b2, 1.0/3.0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static {
|
||||
addSolexa();
|
||||
add454();
|
||||
addSOLiD();
|
||||
addUnknown();
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// The actual objects themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private boolean raiseErrorOnUnknownPlatform = true;
|
||||
private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN;
|
||||
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods() { super(); }
|
||||
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); }
|
||||
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors, boolean raiseErrorOnUnknownPlatform) {
|
||||
super(priors);
|
||||
this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform;
|
||||
}
|
||||
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors, SequencerPlatform assumeUnknownPlatformsAreThis) {
|
||||
super(priors);
|
||||
|
||||
if ( assumeUnknownPlatformsAreThis != null ) {
|
||||
raiseErrorOnUnknownPlatform = false;
|
||||
defaultPlatform = assumeUnknownPlatformsAreThis;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
|
|
@ -40,73 +228,35 @@ public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotype
|
|||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
// NA12878 CHR1 solexa table
|
||||
//
|
||||
// True base A C G T Totals
|
||||
// Error base Counts 1190455 686187 685080 1227860 3789582
|
||||
// A 938661 0.0% 48.6% 16.3% 35.1% 100.00%
|
||||
// C 955469 60.6% 0.0% 7.9% 31.5% 100.00%
|
||||
// G 964246 30.2% 7.8% 0.0% 62.0% 100.00%
|
||||
// T 931206 34.4% 16.6% 49.0% 0.0% 100.00%
|
||||
//
|
||||
// V2 table -- didn't incorporate complements based on read orientation
|
||||
// static {
|
||||
// addMisCall('A', 'C', 48.6/100.0);
|
||||
// addMisCall('A', 'G', 16.3/100.0);
|
||||
// addMisCall('A', 'T', 35.1/100.0);
|
||||
//
|
||||
// addMisCall('C', 'A', 60.6/100.0);
|
||||
// addMisCall('C', 'G', 7.9/100.0);
|
||||
// addMisCall('C', 'T', 31.5/100.0);
|
||||
//
|
||||
// addMisCall('G', 'A', 30.2/100.0);
|
||||
// addMisCall('G', 'C', 7.8/100.0);
|
||||
// addMisCall('G', 'T', 62.0/100.0);
|
||||
//
|
||||
// addMisCall('T', 'A', 34.4/100.0);
|
||||
// addMisCall('T', 'C', 16.6/100.0);
|
||||
// addMisCall('T', 'G', 49.9/100.0);
|
||||
// }
|
||||
|
||||
//True base A C G T Totals
|
||||
//Error base Counts 1209203 718053 653214 1209112 3789582
|
||||
//A 809944 0.0% 59.2% 15.3% 25.6% 100.00%
|
||||
//C 934612 54.2% 0.0% 10.3% 35.5% 100.00%
|
||||
//G 985103 26.4% 5.6% 0.0% 68.0% 100.00%
|
||||
//T 1059923 41.8% 17.3% 40.9% 0.0% 100.00%
|
||||
static {
|
||||
addMisCall('A', 'C', 59.2/100.0);
|
||||
addMisCall('A', 'G', 15.3/100.0);
|
||||
addMisCall('A', 'T', 25.6/100.0);
|
||||
|
||||
addMisCall('C', 'A', 54.2/100.0);
|
||||
addMisCall('C', 'G', 10.3/100.0);
|
||||
addMisCall('C', 'T', 35.5/100.0);
|
||||
|
||||
addMisCall('G', 'A', 26.4/100.0);
|
||||
addMisCall('G', 'C', 5.6/100.0);
|
||||
addMisCall('G', 'T', 68.0/100.0);
|
||||
|
||||
addMisCall('T', 'A', 41.8/100.0);
|
||||
addMisCall('T', 'C', 17.3/100.0);
|
||||
addMisCall('T', 'G', 40.9/100.0);
|
||||
}
|
||||
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods() { super(); }
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); }
|
||||
|
||||
protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, boolean fwdStrand) {
|
||||
if ( fwdStrand ) {
|
||||
return getProbMiscallIsBase(observedBase, chromBase);
|
||||
} else {
|
||||
double v = getProbMiscallIsBase(BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase));
|
||||
//System.out.printf("R: %c/%c => %f compared to %f%n", observedBase, chromBase, pow(10, getProbMiscallIsBase(observedBase, chromBase)), pow(10, v));
|
||||
return v;
|
||||
protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
|
||||
final String readGroupString = ((String)read.getAttribute("RG"));
|
||||
SAMReadGroupRecord readGroup = readGroupString == null ? null : read.getHeader().getReadGroup(readGroupString);
|
||||
final String platformName = readGroup == null ? null : (String)readGroup.getAttribute(SAM_PLATFORM_TAG);
|
||||
SequencerPlatform pl = standardizeSequencerPlatform(platformName);
|
||||
|
||||
if ( pl == SequencerPlatform.UNKNOWN ) {
|
||||
if ( raiseErrorOnUnknownPlatform )
|
||||
throw new RuntimeException("Unknown Sequencer platform for read " + read.format());
|
||||
else {
|
||||
pl = defaultPlatform;
|
||||
}
|
||||
}
|
||||
|
||||
//System.out.printf("%s for %s%n", pl, read);
|
||||
|
||||
double log10p = 0.0;
|
||||
if ( fwdStrand ) {
|
||||
log10p = getProbMiscallIsBase(pl, observedBase, chromBase);
|
||||
} else {
|
||||
log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase));
|
||||
}
|
||||
|
||||
//System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() );
|
||||
|
||||
return log10p;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -51,6 +51,9 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
|
||||
private DiploidGenotypePriors priors = null;
|
||||
|
||||
private boolean verbose = false;
|
||||
private int minQScoreToInclude = 0;
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
|
||||
*/
|
||||
|
|
@ -85,6 +88,23 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
posteriors = priors.getPriors().clone(); // posteriors are all the priors
|
||||
}
|
||||
|
||||
|
||||
public void setVerbose(boolean v) {
|
||||
verbose = v;
|
||||
}
|
||||
|
||||
public boolean isVerbose() {
|
||||
return verbose;
|
||||
}
|
||||
|
||||
public int getMinQScoreToInclude() {
|
||||
return minQScoreToInclude;
|
||||
}
|
||||
|
||||
public void setMinQScoreToInclude(int minQScoreToInclude) {
|
||||
this.minQScoreToInclude = minQScoreToInclude;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
|
||||
* @return
|
||||
|
|
@ -142,23 +162,6 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
/**
|
||||
* Are we ignoring Q0 bases during calculations?
|
||||
* @return
|
||||
*/
|
||||
public boolean isFilteringQ0Bases() {
|
||||
return filterQ0Bases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable / disable filtering of Q0 bases. Enabled by default
|
||||
*
|
||||
* @param filterQ0Bases
|
||||
*/
|
||||
public void filterQ0Bases(boolean filterQ0Bases) {
|
||||
this.filterQ0Bases = filterQ0Bases;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
|
|
@ -175,58 +178,8 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
* @param qualityScore
|
||||
* @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
|
||||
*/
|
||||
public int add(char observedBase, byte qualityScore, boolean fwdStrand) {
|
||||
return add(observedBase, qualityScore, fwdStrand, true, false);
|
||||
}
|
||||
|
||||
public int add(char observedBase, byte qualityScore, boolean fwdStrand, boolean enableCache, boolean verbose) {
|
||||
if ( badBase(observedBase) ) {
|
||||
throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore));
|
||||
}
|
||||
|
||||
// TODO -- we should probably filter Q1 bases too
|
||||
if (qualityScore <= 0) {
|
||||
if ( isFilteringQ0Bases() ) {
|
||||
return 0;
|
||||
} else {
|
||||
qualityScore = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle caching if requested. Just look up the cached result if its available, or compute and store it
|
||||
NewHotnessGenotypeLikelihoods cached = null;
|
||||
if ( enableCache ) {
|
||||
if ( ! inCache(observedBase, qualityScore, FIXED_PLOIDY, fwdStrand) ) {
|
||||
cached = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, fwdStrand);
|
||||
} else {
|
||||
cached = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, fwdStrand);
|
||||
}
|
||||
}
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double likelihoodCalc = ! enableCache ? log10PofObservingBaseGivenGenotype(observedBase, g, qualityScore, fwdStrand) : 0.0;
|
||||
double likelihoodCached = enableCache ? cached.likelihoods[g.ordinal()] : 0.0;
|
||||
double likelihood = enableCache ? likelihoodCached : likelihoodCalc;
|
||||
|
||||
//if ( enableCache && likelihoodCalc != 0.0 && MathUtils.compareDoubles(likelihoodCached, likelihoodCalc) != 0 ) {
|
||||
// System.out.printf("ERROR: Likelihoods not equal is cache=%f != calc=%f for %c %d %s%n",
|
||||
// likelihoodCached, likelihoodCalc, observedBase, qualityScore, g.toString());
|
||||
//}
|
||||
if ( verbose )
|
||||
System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f%n", observedBase, g, qualityScore, fwdStrand ? "+" : "-", likelihood);
|
||||
|
||||
likelihoods[g.ordinal()] += likelihood;
|
||||
posteriors[g.ordinal()] += likelihood;
|
||||
}
|
||||
|
||||
if ( verbose ) {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
|
||||
System.out.println();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", likelihoods[g.ordinal()]); }
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
return 1;
|
||||
public int add(char observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
return reallyAdd(observedBase, qualityScore, read, offset, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -237,7 +190,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
* @param pileup
|
||||
* @return the number of good bases found in the pileup
|
||||
*/
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean verbose) {
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases) {
|
||||
int n = 0;
|
||||
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
|
|
@ -246,20 +199,67 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
char base = read.getReadString().charAt(offset);
|
||||
byte qual = read.getBaseQualities()[offset];
|
||||
if ( ! ignoreBadBases || ! badBase(base) ) {
|
||||
n += add(base, qual, ! read.getReadNegativeStrandFlag(), true, verbose);
|
||||
n += add(base, qual, read, offset);
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases) {
|
||||
return add(pileup, ignoreBadBases, false);
|
||||
public int add(ReadBackedPileup pileup) {
|
||||
return add(pileup, false);
|
||||
}
|
||||
|
||||
|
||||
public int add(ReadBackedPileup pileup) {
|
||||
return add(pileup, false);
|
||||
private int reallyAdd(char observedBase, byte qualityScore, SAMRecord read, int offset, boolean enableCache) {
|
||||
if ( badBase(observedBase) ) {
|
||||
throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore));
|
||||
}
|
||||
|
||||
int nBasesAdded = 0; // the result -- how many bases did we add?
|
||||
|
||||
if ( qualityScore > getMinQScoreToInclude() ) {
|
||||
// Handle caching if requested. Just look up the cached result if its available, or compute and store it
|
||||
NewHotnessGenotypeLikelihoods cached = null;
|
||||
if ( enableCache ) {
|
||||
if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) {
|
||||
cached = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset );
|
||||
} else {
|
||||
cached = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset );
|
||||
}
|
||||
}
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double likelihoodCalc = ! enableCache ? log10PofObservingBaseGivenGenotype(observedBase, g, qualityScore, read, offset) : 0.0;
|
||||
double likelihoodCached = enableCache ? cached.likelihoods[g.ordinal()] : 0.0;
|
||||
double likelihood = enableCache ? likelihoodCached : likelihoodCalc;
|
||||
|
||||
//if ( enableCache && likelihoodCalc != 0.0 && MathUtils.compareDoubles(likelihoodCached, likelihoodCalc) != 0 ) {
|
||||
// System.out.printf("ERROR: Likelihoods not equal is cache=%f != calc=%f for %c %d %s%n",
|
||||
// likelihoodCached, likelihoodCalc, observedBase, qualityScore, g.toString());
|
||||
//}
|
||||
|
||||
if ( isVerbose() ) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||
}
|
||||
|
||||
likelihoods[g.ordinal()] += likelihood;
|
||||
posteriors[g.ordinal()] += likelihood;
|
||||
}
|
||||
|
||||
if ( isVerbose() ) {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
|
||||
System.out.println();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", likelihoods[g.ordinal()]); }
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
nBasesAdded = 1;
|
||||
}
|
||||
|
||||
return nBasesAdded;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -272,34 +272,56 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
final static NewHotnessGenotypeLikelihoods[][][][] cache = new NewHotnessGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
|
||||
static int cacheSize = 0;
|
||||
|
||||
private NewHotnessGenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy,
|
||||
SAMRecord read, int offset, NewHotnessGenotypeLikelihoods val ) {
|
||||
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag());
|
||||
|
||||
if ( val != null )
|
||||
cache[i][j][k][x] = val;
|
||||
|
||||
return cache[i][j][k][x];
|
||||
}
|
||||
|
||||
private NewHotnessGenotypeLikelihoods getCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
return getSetCache( observedBase, qualityScore, ploidy, read, offset, null );
|
||||
}
|
||||
|
||||
private void setCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset, NewHotnessGenotypeLikelihoods val ) {
|
||||
getSetCache( observedBase, qualityScore, ploidy, read, offset, val );
|
||||
}
|
||||
|
||||
private int strandIndex(boolean fwdStrand) {
|
||||
return fwdStrand ? 0 : 1;
|
||||
}
|
||||
|
||||
private boolean inCache( char observedBase, byte qualityScore, int ploidy, boolean fwdStrand ) {
|
||||
return cache[BaseUtils.simpleBaseToBaseIndex(observedBase)][qualityScore][ploidy][strandIndex(fwdStrand)] != null;
|
||||
private boolean inCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
return getCache(observedBase, qualityScore, ploidy, read, offset) != null;
|
||||
}
|
||||
|
||||
private NewHotnessGenotypeLikelihoods getCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, boolean fwdStrand ) {
|
||||
if ( ! inCache(observedBase, qualityScore, ploidy, fwdStrand ) )
|
||||
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, fwdstrand=%b",
|
||||
observedBase, qualityScore, ploidy, fwdStrand));
|
||||
private NewHotnessGenotypeLikelihoods getCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
if ( ! inCache(observedBase, qualityScore, ploidy, read, offset ) )
|
||||
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||
observedBase, qualityScore, ploidy, read));
|
||||
|
||||
return cache[BaseUtils.simpleBaseToBaseIndex(observedBase)][qualityScore][ploidy][strandIndex(fwdStrand)];
|
||||
return getCache(observedBase, qualityScore, ploidy, read, offset);
|
||||
}
|
||||
|
||||
private NewHotnessGenotypeLikelihoods calculateCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, boolean fwdStrand ) {
|
||||
if ( inCache(observedBase, qualityScore, ploidy, fwdStrand ) )
|
||||
throw new RuntimeException(String.format("BUG: trying to set an already set cached genotype likelihood at base=%c, qual=%d, ploidy=%d, fwdstrand=%b",
|
||||
observedBase, qualityScore, ploidy, fwdStrand));
|
||||
private NewHotnessGenotypeLikelihoods calculateCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
if ( inCache(observedBase, qualityScore, ploidy, read, offset ) )
|
||||
throw new RuntimeException(String.format("BUG: trying to set an already set cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||
observedBase, qualityScore, ploidy, read));
|
||||
|
||||
// create a new genotype likelihoods object and add this single base to it -- now we have a cached result
|
||||
try {
|
||||
NewHotnessGenotypeLikelihoods g = (NewHotnessGenotypeLikelihoods)this.clone();
|
||||
g.initialize();
|
||||
g.add(observedBase, qualityScore, fwdStrand, false, false);
|
||||
g.reallyAdd(observedBase, qualityScore, read, offset, false);
|
||||
|
||||
cache[BaseUtils.simpleBaseToBaseIndex(observedBase)][qualityScore][ploidy][strandIndex(fwdStrand)] = g;
|
||||
setCache(observedBase, qualityScore, ploidy, read, offset, g);
|
||||
cacheSize++;
|
||||
|
||||
//System.out.printf("Caching %c %d %d %d %b (%d total entries)%n", observedBase, BaseUtils.simpleBaseToBaseIndex(observedBase), qualityScore, ploidy, fwdStrand, cacheSize);
|
||||
|
|
@ -403,19 +425,20 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
* @param qual
|
||||
* @return
|
||||
*/
|
||||
protected double log10PofObservingBaseGivenGenotype(char observedBase, DiploidGenotype g, byte qual, boolean fwdStrand) {
|
||||
protected double log10PofObservingBaseGivenGenotype(char observedBase, DiploidGenotype g, byte qual, SAMRecord read, int offset ) {
|
||||
if (qual == 0) { // zero quals are wrong
|
||||
throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d", observedBase, g, qual));
|
||||
throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d at %d in %s",
|
||||
observedBase, g, qual, offset, read));
|
||||
}
|
||||
|
||||
// todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop
|
||||
double p_base = 0.0;
|
||||
p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base1, qual, FIXED_PLOIDY, fwdStrand));
|
||||
p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base2, qual, FIXED_PLOIDY, fwdStrand));
|
||||
p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base1, qual, FIXED_PLOIDY, read, offset ));
|
||||
p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base2, qual, FIXED_PLOIDY, read, offset ));
|
||||
return log10(p_base);
|
||||
}
|
||||
|
||||
protected double log10PofObservingBaseGivenChromosome(char observedBase, char chromBase, byte qual, int ploidy, boolean fwdStrand) {
|
||||
protected double log10PofObservingBaseGivenChromosome(char observedBase, char chromBase, byte qual, int ploidy, SAMRecord read, int offset) {
|
||||
double logP = 0.0;
|
||||
|
||||
if ( observedBase == chromBase ) {
|
||||
|
|
@ -425,7 +448,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
logP = log10(1.0 - e);
|
||||
} else {
|
||||
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
|
||||
logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, fwdStrand);
|
||||
logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
|
||||
}
|
||||
|
||||
// adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
|
||||
|
|
@ -441,7 +464,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement
|
|||
*/
|
||||
protected static final double log103 = log10(3.0);
|
||||
|
||||
protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, boolean fwdStrand) {
|
||||
protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) {
|
||||
return -log103; // currently equivalent to e/3 model
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -46,16 +46,10 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
@Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false)
|
||||
public boolean VERBOSE = false;
|
||||
|
||||
// todo - print out priors at the start of SSG with .INFO
|
||||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
||||
|
||||
// todo -- remove dbSNP awareness until we understand how it should be used
|
||||
//@Argument(fullName = "priors_dbsnp", shortName = "pdbsnp", doc = "Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required = false)
|
||||
//public String PRIORS_DBSNP = "0.999,1e-3,1e-5";
|
||||
|
||||
@Argument(fullName = "keepQ0Bases", shortName = "keepQ0Bases", doc = "If true, then Q0 bases will be included in the genotyping calculation, and treated as Q1 -- this is really not a good idea", required = false)
|
||||
public boolean keepQ0Bases = false;
|
||||
|
||||
/**
|
||||
/**
|
||||
* Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage).
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
|
|
@ -84,11 +78,16 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
char ref = refContext.getBase();
|
||||
DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
|
||||
|
||||
NewHotnessGenotypeLikelihoods G = useEmpiricalTransitions ? new EmpiricalSubstitutionGenotypeLikelihoods(priors) : new NewHotnessGenotypeLikelihoods(priors);
|
||||
// setup GenotypeLike object
|
||||
NewHotnessGenotypeLikelihoods G;
|
||||
if ( useEmpiricalTransitions )
|
||||
G = new EmpiricalSubstitutionGenotypeLikelihoods(priors, defaultPlatform);
|
||||
else
|
||||
G = new NewHotnessGenotypeLikelihoods(priors);
|
||||
|
||||
G.filterQ0Bases(! keepQ0Bases);
|
||||
G.setVerbose(VERBOSE);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
G.add(pileup, true, VERBOSE);
|
||||
G.add(pileup, true);
|
||||
G.validate();
|
||||
|
||||
// lets setup the locus
|
||||
|
|
|
|||
|
|
@ -208,7 +208,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
if (base == 'T'){numTs++;}
|
||||
if (base == 'G'){numGs++;}
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add(base, qual, ! read.getReadNegativeStrandFlag());
|
||||
G.add(base, qual, read, offset);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue