Miscellaneous improvements to indel genotyper:
- Add a simple calculation model for Pr(R|H) that doesn't rely on Dindel's HMM model. MUCH faster, at a cost of slightly worse performance since we're more sensitive to bad reads coming from sequencing artifacts (add -simple to command line to activate). - Add debug option to calculation model so that we can optionally output useful info on current read being evaluated. (add -debugout to commandline). - Small performance improvement: instead of evaluating haplotype to the right of indel (just with a 5 base addition to the left), it seems better to center the indel and to add context evenly around event. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4257 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
61d511f601
commit
da2e879bbc
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -63,6 +64,15 @@ public class HaplotypeIndelErrorModel {
|
||||||
private final double logInsertionEndProbability;
|
private final double logInsertionEndProbability;
|
||||||
private final double logOneMinusInsertionEndProbability;
|
private final double logOneMinusInsertionEndProbability;
|
||||||
|
|
||||||
|
private boolean DEBUG = false;
|
||||||
|
private boolean doSimpleCalculationModel = false;
|
||||||
|
|
||||||
|
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength,
|
||||||
|
boolean dosimple, boolean deb) {
|
||||||
|
this(mrdl, insStart, insEnd, alpha, haplotypeSize, maxReadLength);
|
||||||
|
this.DEBUG = deb;
|
||||||
|
this.doSimpleCalculationModel = dosimple;
|
||||||
|
}
|
||||||
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength) {
|
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength) {
|
||||||
this.maxReadDeletionLength = mrdl;
|
this.maxReadDeletionLength = mrdl;
|
||||||
this.noDeletionProbability = 1-alpha;
|
this.noDeletionProbability = 1-alpha;
|
||||||
|
|
@ -115,56 +125,133 @@ public class HaplotypeIndelErrorModel {
|
||||||
}
|
}
|
||||||
|
|
||||||
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {
|
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {
|
||||||
|
long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart();
|
||||||
|
long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd();
|
||||||
final long readStartPosition = read.getAlignmentStart();
|
final long readStartPosition = read.getAlignmentStart();
|
||||||
final long haplotypeStartPosition = haplotype.getStartPosition();
|
final long haplotypeStartPosition = haplotype.getStartPosition();
|
||||||
final long readEndPosition = read.getUnclippedEnd();
|
//final long readEndPosition = read.getUnclippedEnd();
|
||||||
final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1;
|
//final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1;
|
||||||
final byte[] readBases = read.getReadBases();
|
|
||||||
final byte[] readQuals = read.getBaseQualities();
|
byte[] readBases = Arrays.copyOfRange(read.getReadBases(),(int)numStartClippedBases,
|
||||||
|
(int)(read.getReadBases().length-numEndClippedBases));
|
||||||
|
|
||||||
|
byte[] readQuals = Arrays.copyOfRange(read.getBaseQualities(),(int)numStartClippedBases,
|
||||||
|
(int)(read.getReadBases().length-numEndClippedBases));
|
||||||
|
|
||||||
double[] previousPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH];
|
double[] previousPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH];
|
||||||
int readStartIdx, initialIndexInHaplotype;
|
int readStartIdx, initialIndexInHaplotype;
|
||||||
|
|
||||||
/* List<AlignmentBlock> b = read.getAlignmentBlocks();
|
|
||||||
// TODO temp hack, dont take realigned reads for now
|
|
||||||
if (b.size()>1) {
|
|
||||||
rrc++;
|
|
||||||
//System.out.format("found realigned read %d\n", rrc);
|
|
||||||
return 0.0;
|
|
||||||
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
// case 1: read started before haplotype start position, we need to iterate only over remainder of read
|
|
||||||
if (readStartPosition < haplotypeStartPosition) {
|
if (readStartPosition < haplotypeStartPosition) {
|
||||||
if (readEndPosition<= haplotypeEndPosition)
|
readStartIdx = (int)(haplotypeStartPosition- readStartPosition);
|
||||||
readStartIdx = (int)(haplotypeStartPosition - (readEndPosition - read.getReadBases().length)-1);
|
|
||||||
else
|
|
||||||
readStartIdx = (int)(haplotypeStartPosition- readStartPosition);
|
|
||||||
initialIndexInHaplotype = 0; // implicit above
|
initialIndexInHaplotype = 0; // implicit above
|
||||||
|
if ((readStartPosition+readBases.length-1) < haplotypeStartPosition)
|
||||||
|
// when ignoring read CIGAR alignment, this read actually didn't overlap with haplotype:
|
||||||
|
// just ignore read
|
||||||
|
return INFINITE;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
readStartIdx = 0;
|
readStartIdx = 0;
|
||||||
initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition);
|
initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("READ: "+read.getReadName());
|
||||||
|
|
||||||
|
System.out.println(haplotypeStartPosition);
|
||||||
|
System.out.println(readStartPosition);
|
||||||
|
System.out.println(readStartIdx);
|
||||||
|
System.out.println(initialIndexInHaplotype);
|
||||||
|
}
|
||||||
|
|
||||||
|
double pRead = 0;
|
||||||
|
if (doSimpleCalculationModel) {
|
||||||
|
// No Viterbi algorithm - assume no sequencing indel artifacts,
|
||||||
|
// so we can collapse computations and pr(read | haplotype) is just probability of observing overlap
|
||||||
|
// of read with haplotype.
|
||||||
|
int haplotypeIndex = initialIndexInHaplotype;
|
||||||
|
double c = 0.0;//deletionErrorProbabilities[1] +logOneMinusInsertionStartProbability;
|
||||||
|
// compute likelihood of portion of base to the left of the haplotype
|
||||||
|
for (int indR=readStartIdx-1; indR >= 0; indR--) {
|
||||||
|
byte readBase = readBases[indR];
|
||||||
|
byte readQual = readQuals[indR];
|
||||||
|
if (readQual <= 2)
|
||||||
|
continue;
|
||||||
|
double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte)0, readBase, readQual, LEFT_ALIGN_INDEX, 0);
|
||||||
|
|
||||||
|
// pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
|
||||||
|
pRead += pBaseRead;
|
||||||
|
|
||||||
|
}
|
||||||
|
//System.out.format("\nSt: %d Pre-Likelihood:%f\n",readStartIdx, pRead);
|
||||||
|
|
||||||
|
for (int indR=readStartIdx; indR < readBases.length; indR++) {
|
||||||
|
byte readBase = readBases[indR];
|
||||||
|
byte readQual = readQuals[indR];
|
||||||
|
|
||||||
|
byte haplotypeBase;
|
||||||
|
if (haplotypeIndex < RIGHT_ALIGN_INDEX)
|
||||||
|
haplotypeBase = haplotype.getBasesAsBytes()[haplotypeIndex];
|
||||||
|
else
|
||||||
|
haplotypeBase = (byte)0; // dummy
|
||||||
|
|
||||||
|
double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0);
|
||||||
|
if (haplotypeBase != 0)
|
||||||
|
pBaseRead += c;
|
||||||
|
|
||||||
|
// pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
|
||||||
|
if (readQual > 3)
|
||||||
|
pRead += pBaseRead;
|
||||||
|
haplotypeIndex++;
|
||||||
|
if (haplotypeIndex >= haplotype.getBasesAsBytes().length)
|
||||||
|
haplotypeIndex = RIGHT_ALIGN_INDEX;
|
||||||
|
//System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead);
|
||||||
|
}
|
||||||
|
//System.out.format("\nSt: %d Post-Likelihood:%f\n",readStartIdx, pRead);
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(read.getReadName());
|
||||||
|
System.out.print("Haplotype:");
|
||||||
|
|
||||||
|
for (int k=0; k <haplotype.getBasesAsBytes().length; k++) {
|
||||||
|
System.out.format("%c ", haplotype.getBasesAsBytes()[k]);
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
System.out.print("Read bases: ");
|
||||||
|
for (int k=0; k <readBases.length; k++) {
|
||||||
|
System.out.format("%c ", readBases[k]);
|
||||||
|
}
|
||||||
|
System.out.format("\nLikelihood:%f\n",pRead);
|
||||||
|
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
if (read.getReadName().contains("106880")) {
|
||||||
|
|
||||||
|
System.out.println("aca");
|
||||||
|
|
||||||
|
System.out.println("Haplotype:");
|
||||||
|
|
||||||
|
for (int k=initialIndexInHaplotype; k <haplotype.getBasesAsBytes().length; k++) {
|
||||||
|
System.out.format("%c ", haplotype.getBasesAsBytes()[k]);
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
System.out.println("Read bases: ");
|
||||||
|
for (int k=readStartIdx; k <readBases.length; k++) {
|
||||||
|
System.out.format("%c ", readBases[k]);
|
||||||
|
}
|
||||||
|
|
||||||
|
} */
|
||||||
|
return pRead;
|
||||||
|
|
||||||
|
}
|
||||||
// initialize path metric array to hard-code certainty that initial position is aligned
|
// initialize path metric array to hard-code certainty that initial position is aligned
|
||||||
for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++)
|
for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++)
|
||||||
pathMetricArray[k] = INFINITE;
|
pathMetricArray[k] = INFINITE;
|
||||||
|
|
||||||
pathMetricArray[initialIndexInHaplotype] = 0;
|
pathMetricArray[initialIndexInHaplotype] = 0;
|
||||||
|
|
||||||
/*
|
|
||||||
System.out.println(read.getReadName());
|
|
||||||
|
|
||||||
System.out.println(haplotypeStartPosition);
|
|
||||||
System.out.println(readStartPosition);
|
|
||||||
System.out.println(readStartIdx);
|
|
||||||
System.out.println(initialIndexInHaplotype);
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Update path metric computations based on branch metric (Add/Compare/Select operations)
|
// Update path metric computations based on branch metric (Add/Compare/Select operations)
|
||||||
// do forward direction first, ie from anchor to end of read
|
// do forward direction first, ie from anchor to end of read
|
||||||
// outer loop
|
// outer loop
|
||||||
|
|
@ -199,7 +286,7 @@ public class HaplotypeIndelErrorModel {
|
||||||
pathMetricArray[k] = INFINITE;
|
pathMetricArray[k] = INFINITE;
|
||||||
|
|
||||||
pathMetricArray[initialIndexInHaplotype] = 0;
|
pathMetricArray[initialIndexInHaplotype] = 0;
|
||||||
|
|
||||||
// do now backward direction (from anchor to start of read)
|
// do now backward direction (from anchor to start of read)
|
||||||
// outer loop
|
// outer loop
|
||||||
for (int indR=readStartIdx-1; indR >= 0; indR--) {
|
for (int indR=readStartIdx-1; indR >= 0; indR--) {
|
||||||
|
|
@ -223,42 +310,44 @@ public class HaplotypeIndelErrorModel {
|
||||||
|
|
||||||
// for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested
|
// for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested
|
||||||
// in log-likelihood of best state, this isn't really necessary.
|
// in log-likelihood of best state, this isn't really necessary.
|
||||||
/*
|
if (DEBUG) {
|
||||||
int[] bestIndexArray = new int[readBases.length+1];
|
|
||||||
|
|
||||||
|
|
||||||
int bestIndex = MathUtils.minElementIndex(forwardPathMetricArray);
|
int[] bestIndexArray = new int[readBases.length+1];
|
||||||
bestIndexArray[readBases.length] = bestIndex;
|
|
||||||
|
|
||||||
for (int k=readBases.length-1; k>=0; k--) {
|
|
||||||
bestIndex = bestStateIndexArray[k][bestIndex];
|
int bestIndex = MathUtils.minElementIndex(forwardPathMetricArray);
|
||||||
bestIndexArray[k] = bestIndex;
|
bestIndexArray[readBases.length] = bestIndex;
|
||||||
|
|
||||||
|
for (int k=readBases.length-1; k>=0; k--) {
|
||||||
|
bestIndex = bestStateIndexArray[k][bestIndex];
|
||||||
|
bestIndexArray[k] = bestIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
System.out.println(read.getReadName());
|
||||||
|
System.out.print("Haplotype:");
|
||||||
|
|
||||||
|
for (int k=0; k <haplotype.getBasesAsBytes().length; k++) {
|
||||||
|
System.out.format("%c ", haplotype.getBasesAsBytes()[k]);
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
System.out.print("Read bases: ");
|
||||||
|
for (int k=0; k <readBases.length; k++) {
|
||||||
|
System.out.format("%c ", readBases[k]);
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
System.out.print("Alignment: ");
|
||||||
|
for (int k=0; k <readBases.length; k++) {
|
||||||
|
System.out.format("%d ", bestIndexArray[k]);
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
System.out.println(read.getReadName());
|
|
||||||
System.out.print("Haplotype:");
|
|
||||||
|
|
||||||
for (int k=0; k <haplotype.getBasesAsBytes().length; k++) {
|
|
||||||
System.out.format("%c ", haplotype.getBasesAsBytes()[k]);
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
|
|
||||||
System.out.print("Read bases: ");
|
|
||||||
for (int k=0; k <readBases.length; k++) {
|
|
||||||
System.out.format("%c ", readBases[k]);
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
|
|
||||||
System.out.print("Alignment: ");
|
|
||||||
for (int k=0; k <readBases.length; k++) {
|
|
||||||
System.out.format("%d ", bestIndexArray[k]);
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
|
|
||||||
*/
|
|
||||||
// now just take optimum along all path metrics: that's the log likelihood of best alignment
|
// now just take optimum along all path metrics: that's the log likelihood of best alignment
|
||||||
double backwardMetric = MathUtils.arrayMin(pathMetricArray);
|
double backwardMetric = MathUtils.arrayMin(pathMetricArray);
|
||||||
|
if (DEBUG)
|
||||||
|
System.out.format("Likelihood: %5.4f\n", forwardMetric + backwardMetric);
|
||||||
return forwardMetric + backwardMetric;
|
return forwardMetric + backwardMetric;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -313,7 +402,7 @@ public class HaplotypeIndelErrorModel {
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (int indXold = indX+1; indXold <=this.maxReadDeletionLength + indX; indXold++){
|
for (int indXold = indX+1; indXold <=this.maxReadDeletionLength + indX; indXold++){
|
||||||
|
|
@ -390,7 +479,7 @@ public class HaplotypeIndelErrorModel {
|
||||||
private double getProbabilityOfReadBaseGivenXandI(byte haplotypeBase, byte readBase, byte readQual, int indX, int indI) {
|
private double getProbabilityOfReadBaseGivenXandI(byte haplotypeBase, byte readBase, byte readQual, int indX, int indI) {
|
||||||
|
|
||||||
|
|
||||||
if (indX == this.haplotypeSize || indX == 0) {
|
if (indX == RIGHT_ALIGN_INDEX || indX == LEFT_ALIGN_INDEX) {
|
||||||
// X = L or R
|
// X = L or R
|
||||||
double baseProb = QualityUtils.qualToProb(readQual);
|
double baseProb = QualityUtils.qualToProb(readQual);
|
||||||
return probToQual(baseProb);
|
return probToQual(baseProb);
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||||
|
|
||||||
|
|
||||||
@Reference(window=@Window(start=-10,stop=80))
|
@Reference(window=@Window(start=-100,stop=100))
|
||||||
@Allows({DataSource.READS, DataSource.REFERENCE})
|
@Allows({DataSource.READS, DataSource.REFERENCE})
|
||||||
public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
@Output
|
@Output
|
||||||
|
|
@ -73,6 +73,14 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
int HAPLOTYPE_SIZE = 40;
|
int HAPLOTYPE_SIZE = 40;
|
||||||
|
|
||||||
|
|
||||||
|
@Argument(fullName="indelHeterozygozity",shortName="indhet",doc="Indel Heterozygosity (assumed 1/6500 from empirical human data)",required=false)
|
||||||
|
double indelHeterozygosity = (double)1.0/6500.0;
|
||||||
|
|
||||||
|
@Argument(fullName="doSimpleCalculationModel",shortName="simple",doc="Use Simple Calculation Model for Pr(Reads | Haplotype)",required=false)
|
||||||
|
boolean doSimple = false;
|
||||||
|
|
||||||
|
@Argument(fullName="enableDebugOutput",shortName="debugout",doc="Output debug data",required=false)
|
||||||
|
boolean DEBUG = false;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean generateExtendedEvents() { return true; }
|
public boolean generateExtendedEvents() { return true; }
|
||||||
|
|
@ -83,7 +91,7 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
|
|
||||||
private HaplotypeIndelErrorModel model;
|
private HaplotypeIndelErrorModel model;
|
||||||
|
|
||||||
private static final int MAX_READ_LENGTH = 200; // TODO- make this dynamic
|
private static final int MAX_READ_LENGTH = 1000; // TODO- make this dynamic
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -91,20 +99,12 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability,
|
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability,
|
||||||
insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, MAX_READ_LENGTH);
|
insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, MAX_READ_LENGTH, doSimple, DEBUG);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* private void countIndels(ReadBackedExtendedEventPileup p) {
|
|
||||||
for ( ExtendedEventPileupElement pe : p.toExtendedIterable() ) {
|
|
||||||
if ( ! pe.isIndel() ) continue;
|
|
||||||
if ( pe.getEventLength() > MAX_LENGTH ) continue;
|
|
||||||
if ( pe.isInsertion() ) insCounts[pe.getEventLength()-1]++;
|
|
||||||
else delCounts[pe.getEventLength()-1]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
if ( tracker == null )
|
if ( tracker == null )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -132,6 +132,36 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads();
|
ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads();
|
||||||
|
|
||||||
|
|
||||||
|
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||||
|
// In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption),
|
||||||
|
// and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of
|
||||||
|
// another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances
|
||||||
|
// by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval
|
||||||
|
// of size L is Poisson-distributed with parameter lambda = theta*L.
|
||||||
|
|
||||||
|
// Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it
|
||||||
|
// by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and
|
||||||
|
// pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda.
|
||||||
|
|
||||||
|
// We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006)
|
||||||
|
// and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)),
|
||||||
|
// taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one.
|
||||||
|
// so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L).
|
||||||
|
// Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval)
|
||||||
|
// and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be
|
||||||
|
// added to ref hypothesis.
|
||||||
|
// Equation above is prior model.
|
||||||
|
int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); // assume only one alt allele for now
|
||||||
|
if (eventLength<0)
|
||||||
|
eventLength = - eventLength;
|
||||||
|
|
||||||
|
double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity;
|
||||||
|
double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength
|
||||||
|
+ HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5);
|
||||||
|
|
||||||
|
haplotypeLikehoodMatrix[0][1] = altPrior;
|
||||||
|
haplotypeLikehoodMatrix[1][1] = 2*altPrior;
|
||||||
|
|
||||||
int bestIndexI =0, bestIndexJ=0;
|
int bestIndexI =0, bestIndexJ=0;
|
||||||
for (int i=0; i < haplotypesInVC.size(); i++) {
|
for (int i=0; i < haplotypesInVC.size(); i++) {
|
||||||
for (int j=i; j < haplotypesInVC.size(); j++){
|
for (int j=i; j < haplotypesInVC.size(); j++){
|
||||||
|
|
@ -140,9 +170,13 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
// compute Pr(r | hi, hj) = 1/2*(Pr(r|hi) + Pr(r|hj)
|
// compute Pr(r | hi, hj) = 1/2*(Pr(r|hi) + Pr(r|hj)
|
||||||
|
|
||||||
double readLikelihood[] = new double[2];
|
double readLikelihood[] = new double[2];
|
||||||
readLikelihood[0]= -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read)/10.0;
|
double readLikeGivenH;
|
||||||
if (i != j)
|
readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read);
|
||||||
readLikelihood[1] = -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read)/10.0;
|
readLikelihood[0]= -readLikeGivenH/10.0;
|
||||||
|
if (i != j){
|
||||||
|
readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read);
|
||||||
|
readLikelihood[1] = -readLikeGivenH/10.0;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
readLikelihood[1] = readLikelihood[0];
|
readLikelihood[1] = readLikelihood[0];
|
||||||
|
|
||||||
|
|
@ -150,6 +184,11 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
// sumlog10 computes 10^x[0]+10^x[1]+...
|
// sumlog10 computes 10^x[0]+10^x[1]+...
|
||||||
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
||||||
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
||||||
|
/*System.out.print(read.getReadName()+" ");
|
||||||
|
System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f %3.4f\n",i, j, read.getAlignmentStart(),
|
||||||
|
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||||
|
read.getCigarString(), probRGivenHPair, haplotypeLikehoodMatrix[i][j]);
|
||||||
|
*/
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -111,7 +111,7 @@ public class Haplotype {
|
||||||
|
|
||||||
byte[] refBases = ref.getBases();
|
byte[] refBases = ref.getBases();
|
||||||
|
|
||||||
int numPrefBases = 5; // haplotypeSize/2;
|
int numPrefBases = haplotypeSize/2;
|
||||||
|
|
||||||
int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart());
|
int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart());
|
||||||
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
|
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue