Several improvements to new indel genotyper (more to come soon):
a) Turns out previous change of centering haplotype around indel was a bad idea. Context to the left of indel is important but not as important as right one, because by definition all alleles start at the same location, so haplotype is the same to the left of indel regardless of allele. So, go back to having a constant size window to the left of event. b) Expand reference context so we can test larger haplotypes. c) Optimize computation of read likelihoods by doing them in linear array instead of in a matrix - no difference in biallelic sites but could be significantly faster in multiallelic sites. d) Bug fix: read alignment wasn't being computed correctly if, a) we were at an insertion, b) read started right at the insertion, c) read CIGAR didn't include insertion - more of these corner conditions are lurking, so a revamped computation of how reads align to candidate haplotypes is in the works. e) Add debug option not to use prior haplotype likelihoods. f) Don't hard-code NA12878 for genotyping, now sample name is a required input argument. g) Bug fix: if there are no reads covering a candidate indel event, just output NO_CALL (didn't notice this in HiSeq, but in P1 data it happens all the time). I need to add a confidence threshold for calling later on. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4291 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fb6d7d19f9
commit
c604ed9440
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
|
||||||
|
|
||||||
import net.sf.samtools.AlignmentBlock;
|
import net.sf.samtools.AlignmentBlock;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
@ -124,7 +125,8 @@ public class HaplotypeIndelErrorModel {
|
||||||
return -10.0*Math.log10(prob);
|
return -10.0*Math.log10(prob);
|
||||||
}
|
}
|
||||||
|
|
||||||
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {
|
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read,
|
||||||
|
VariantContext vc, int eventLength) {
|
||||||
long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart();
|
long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart();
|
||||||
long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd();
|
long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd();
|
||||||
final long readStartPosition = read.getAlignmentStart();
|
final long readStartPosition = read.getAlignmentStart();
|
||||||
|
|
@ -154,6 +156,11 @@ public class HaplotypeIndelErrorModel {
|
||||||
initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition);
|
initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// special case 1: we're analyzing an insertion and the read starts right at the insertion.
|
||||||
|
if ((readStartPosition > vc.getStart()-eventLength) && vc.isInsertion() && !haplotype.isReference()) {
|
||||||
|
initialIndexInHaplotype += eventLength;
|
||||||
|
}
|
||||||
|
|
||||||
if (DEBUG) {
|
if (DEBUG) {
|
||||||
System.out.println("READ: "+read.getReadName());
|
System.out.println("READ: "+read.getReadName());
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||||
|
|
||||||
|
|
||||||
@Reference(window=@Window(start=-100,stop=100))
|
@Reference(window=@Window(start=-30,stop=200))
|
||||||
@Allows({DataSource.READS, DataSource.REFERENCE})
|
@Allows({DataSource.READS, DataSource.REFERENCE})
|
||||||
public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
@Output
|
@Output
|
||||||
|
|
@ -76,12 +76,21 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
@Argument(fullName="indelHeterozygozity",shortName="indhet",doc="Indel Heterozygosity (assumed 1/6500 from empirical human data)",required=false)
|
@Argument(fullName="indelHeterozygozity",shortName="indhet",doc="Indel Heterozygosity (assumed 1/6500 from empirical human data)",required=false)
|
||||||
double indelHeterozygosity = (double)1.0/6500.0;
|
double indelHeterozygosity = (double)1.0/6500.0;
|
||||||
|
|
||||||
|
@Argument(fullName="sampleName",shortName="sample",doc="Sample name to evaluate genotypes in",required=true)
|
||||||
|
String sampleName;
|
||||||
|
|
||||||
@Argument(fullName="doSimpleCalculationModel",shortName="simple",doc="Use Simple Calculation Model for Pr(Reads | Haplotype)",required=false)
|
@Argument(fullName="doSimpleCalculationModel",shortName="simple",doc="Use Simple Calculation Model for Pr(Reads | Haplotype)",required=false)
|
||||||
boolean doSimple = false;
|
boolean doSimple = false;
|
||||||
|
|
||||||
@Argument(fullName="enableDebugOutput",shortName="debugout",doc="Output debug data",required=false)
|
@Argument(fullName="enableDebugOutput",shortName="debugout",doc="Output debug data",required=false)
|
||||||
boolean DEBUG = false;
|
boolean DEBUG = false;
|
||||||
|
|
||||||
|
@Argument(fullName="doTrio",shortName="trio",doc="Output 1KG CEU trio genotype data (sampleName ignored)",required=false)
|
||||||
|
boolean doTrio = false;
|
||||||
|
|
||||||
|
@Argument(fullName="useFlatPriors",shortName="flat",doc="If present, use flat priors on haplotypes",required=false)
|
||||||
|
boolean useFlatPriors = false;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean generateExtendedEvents() { return true; }
|
public boolean generateExtendedEvents() { return true; }
|
||||||
|
|
||||||
|
|
@ -155,40 +164,51 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
if (eventLength<0)
|
if (eventLength<0)
|
||||||
eventLength = - eventLength;
|
eventLength = - eventLength;
|
||||||
|
|
||||||
|
if (!useFlatPriors) {
|
||||||
|
|
||||||
double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity;
|
double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity;
|
||||||
double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength
|
double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength
|
||||||
+ HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5);
|
+ HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5);
|
||||||
|
|
||||||
haplotypeLikehoodMatrix[0][1] = altPrior;
|
haplotypeLikehoodMatrix[0][1] = altPrior;
|
||||||
haplotypeLikehoodMatrix[1][1] = 2*altPrior;
|
haplotypeLikehoodMatrix[1][1] = 2*altPrior;
|
||||||
|
}
|
||||||
|
|
||||||
int bestIndexI =0, bestIndexJ=0;
|
int bestIndexI =-1, bestIndexJ=-1;
|
||||||
for (int i=0; i < haplotypesInVC.size(); i++) {
|
double callConfidence = 0.0;
|
||||||
|
|
||||||
|
if (pileup.getReads().size() > 0) {
|
||||||
|
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
|
||||||
|
int i=0;
|
||||||
|
for (SAMRecord read : pileup.getReads()) {
|
||||||
|
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||||
|
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||||
|
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||||
|
readLikelihoods[i][j]= model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength);
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.print(read.getReadName()+" ");
|
||||||
|
|
||||||
|
System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",i, j, read.getAlignmentStart(),
|
||||||
|
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||||
|
read.getCigarString(), readLikelihoods[i][j]);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i=0; i < haplotypesInVC.size(); i++) {
|
||||||
for (int j=i; j < haplotypesInVC.size(); j++){
|
for (int j=i; j < haplotypesInVC.size(); j++){
|
||||||
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
|
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
|
||||||
for (SAMRecord read : pileup.getReads()) {
|
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
|
||||||
// compute Pr(r | hi, hj) = 1/2*(Pr(r|hi) + Pr(r|hj)
|
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
|
||||||
|
double[] readLikelihood = new double[2]; // diploid sample
|
||||||
|
for (int readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
|
||||||
|
readLikelihood[0] = -readLikelihoods[readIdx][i]/10;
|
||||||
|
readLikelihood[1] = -readLikelihoods[readIdx][j]/10;
|
||||||
|
|
||||||
double readLikelihood[] = new double[2];
|
|
||||||
double readLikeGivenH;
|
|
||||||
readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read);
|
|
||||||
readLikelihood[0]= -readLikeGivenH/10.0;
|
|
||||||
if (i != j){
|
|
||||||
readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read);
|
|
||||||
readLikelihood[1] = -readLikeGivenH/10.0;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
readLikelihood[1] = readLikelihood[0];
|
|
||||||
|
|
||||||
// likelihood is by convention -10*log10(pr(r|h))
|
|
||||||
// sumlog10 computes 10^x[0]+10^x[1]+...
|
|
||||||
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
||||||
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
||||||
/*System.out.print(read.getReadName()+" ");
|
|
||||||
System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f %3.4f\n",i, j, read.getAlignmentStart(),
|
|
||||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
|
||||||
read.getCigarString(), probRGivenHPair, haplotypeLikehoodMatrix[i][j]);
|
|
||||||
*/
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -196,10 +216,12 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) {
|
if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) {
|
||||||
bestIndexI = i;
|
bestIndexI = i;
|
||||||
bestIndexJ = j;
|
bestIndexJ = j;
|
||||||
|
callConfidence = bestLikelihood - haplotypeLikehoodMatrix[i][j];
|
||||||
bestLikelihood = haplotypeLikehoodMatrix[i][j];
|
bestLikelihood = haplotypeLikehoodMatrix[i][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//we say that most likely genotype at site is index (i,j) maximizing likelihood matrix
|
//we say that most likely genotype at site is index (i,j) maximizing likelihood matrix
|
||||||
String type;
|
String type;
|
||||||
|
|
@ -215,17 +237,15 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
|
|
||||||
out.format("%s %d %s ",vc.getChr(), vc.getStart(), type);
|
out.format("%s %d %s ",vc.getChr(), vc.getStart(), type);
|
||||||
|
|
||||||
|
if (doTrio) {
|
||||||
Genotype originalGenotype = vc.getGenotype("NA12878");
|
Genotype originalGenotype = vc.getGenotype("NA12878");
|
||||||
|
Genotype dadGenotype = vc.getGenotype("NA12891");
|
||||||
|
Genotype momGenotype = vc.getGenotype("NA12892");
|
||||||
|
|
||||||
String oldG, newG;
|
String dadG, momG, oldG, newG;
|
||||||
if (originalGenotype.isHomRef())
|
oldG = getGenotypeString(originalGenotype);
|
||||||
oldG = "HOMREF";
|
dadG = getGenotypeString(dadGenotype);
|
||||||
else if (originalGenotype.isHet())
|
momG = getGenotypeString(momGenotype);
|
||||||
oldG = "HET";
|
|
||||||
else if (originalGenotype.isHomVar())
|
|
||||||
oldG = "HOMVAR";
|
|
||||||
else
|
|
||||||
oldG = "OTHER";
|
|
||||||
|
|
||||||
int x = bestIndexI+bestIndexJ;
|
int x = bestIndexI+bestIndexJ;
|
||||||
if (x == 0)
|
if (x == 0)
|
||||||
|
|
@ -234,11 +254,32 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
newG = "HET";
|
newG = "HET";
|
||||||
else if (x == 2)
|
else if (x == 2)
|
||||||
newG = "HOMVAR";
|
newG = "HOMVAR";
|
||||||
|
else if (x < 0)
|
||||||
|
newG = "NOCALL";
|
||||||
|
else
|
||||||
|
newG = "OTHER";
|
||||||
|
|
||||||
|
out.format("NewG %s OldG %s DadG %s MomG %s\n", newG, oldG, dadG, momG);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Genotype originalGenotype = vc.getGenotype(sampleName);
|
||||||
|
String oldG, newG;
|
||||||
|
oldG = getGenotypeString(originalGenotype);
|
||||||
|
int x = bestIndexI+bestIndexJ;
|
||||||
|
if (x == 0)
|
||||||
|
newG = "HOMREF";
|
||||||
|
else if (x == 1)
|
||||||
|
newG = "HET";
|
||||||
|
else if (x == 2)
|
||||||
|
newG = "HOMVAR";
|
||||||
|
else if (x < 0)
|
||||||
|
newG = "NOCALL";
|
||||||
else
|
else
|
||||||
newG = "OTHER";
|
newG = "OTHER";
|
||||||
|
|
||||||
out.format("NewG %s OldG %s\n", newG, oldG);
|
out.format("NewG %s OldG %s\n", newG, oldG);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
if ( context.hasExtendedEventPileup() ) {
|
if ( context.hasExtendedEventPileup() ) {
|
||||||
|
|
@ -272,6 +313,19 @@ public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||||
return 1; //To change body of implemented methods use File | Settings | File Templates.
|
return 1; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getGenotypeString(Genotype genotype) {
|
||||||
|
String oldG;
|
||||||
|
if (genotype.isHomRef())
|
||||||
|
oldG = "HOMREF";
|
||||||
|
else if (genotype.isHet())
|
||||||
|
oldG = "HET";
|
||||||
|
else if (genotype.isHomVar())
|
||||||
|
oldG = "HOMVAR";
|
||||||
|
else
|
||||||
|
oldG = "OTHER";
|
||||||
|
|
||||||
|
return oldG;
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* Provide an initial value for reduce computations.
|
* Provide an initial value for reduce computations.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -111,7 +111,7 @@ public class Haplotype {
|
||||||
|
|
||||||
byte[] refBases = ref.getBases();
|
byte[] refBases = ref.getBases();
|
||||||
|
|
||||||
int numPrefBases = haplotypeSize/2;
|
int numPrefBases = 20;
|
||||||
|
|
||||||
int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart());
|
int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart());
|
||||||
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
|
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue