First step in refactoring the way the UnifiedGenotyper (UG) stores indel likelihoods. The main motivation is that rank-sum annotations require per-read quality or likelihood information, and even the question "which allele of a variant is present in a read", which is trivial for SNPs, may not be so straightforward for indels.
This step only changes how likelihoods are stored: instead of an internal matrix, a class member now holds a hash table mapping each pileup element to its per-allele likelihoods. There is no functional change beyond the internal data storage. As a bonus, we get a 2-3x speedup in calling for free, because redundant likelihood computations are removed. The next step will hook this up to the annotation engine and redefine how it interacts with UG for the indel case.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5809 348d0f76-0448-11de-a6fe-93d51630548a
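For illustration, a minimal sketch of the new storage scheme: PileupElement and Allele are simplified stand-ins for the real GATK classes, and only the map shape and the lookup-before-recompute pattern follow the actual diff below.

    import java.util.HashMap;
    import java.util.LinkedHashMap;

    // Simplified stand-ins for the GATK types referenced in the diff below.
    class PileupElement { /* one read's observation at the current locus */ }
    class Allele { /* a candidate reference or alternate allele */ }

    class IndelLikelihoodStore {
        // One entry per pileup element (i.e. per read at this site); each entry
        // maps every candidate allele to that read's log10 likelihood. The
        // LinkedHashMap keeps the alleles in a stable iteration order.
        private final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap =
                new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();

        // If the read was already scored at this site, its likelihoods are
        // reused instead of recomputed; that is where the speedup comes from.
        LinkedHashMap<Allele, Double> cached(PileupElement p) {
            return indelLikelihoodMap.get(p);
        }

        void store(PileupElement p, LinkedHashMap<Allele, Double> perAlleleLikelihoods) {
            indelLikelihoodMap.put(p, perAlleleLikelihoods);
        }

        // Cleared whenever the calculation model moves to a new site.
        void clear() {
            indelLikelihoodMap.clear();
        }
    }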
parent 3ccc08ace4
commit 5a7444e186
@@ -34,10 +34,12 @@ import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
 import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.exceptions.StingException;
 import org.broadinstitute.sting.utils.genotype.Haplotype;
 import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -57,9 +59,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
     private boolean DEBUG = false;

     private PairHMMIndelErrorModel pairModel;
-    // gdebug removeme
-    // todo -cleanup
-    private HaplotypeIndelErrorModel model;
+    private HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap;
+    private LinkedHashMap<Allele,Haplotype> haplotypeMap;
+
+    // gdebug removeme
+    // todo -cleanup
+    private HaplotypeIndelErrorModel model;
+
     private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
     //
     private GenomeLoc lastSiteVisited;
@@ -84,13 +90,18 @@ private HaplotypeIndelErrorModel model;
             model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY,
                     INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
         }
+
         pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
                 UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
         alleleList = new ArrayList<Allele>();
         getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
         minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
         HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
         DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;

+        indelLikelihoodMap = new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
+        haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
+
     }

@@ -215,7 +226,7 @@ private HaplotypeIndelErrorModel model;

         if (DEBUG) {
             int icount = indelPileup.getNumberOfInsertions();
             int dcount = indelPileup.getNumberOfDeletions();
             if (icount + dcount > 0)
             {
                 List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
@@ -294,7 +305,8 @@ private HaplotypeIndelErrorModel model;
             // starting a new site: clear allele list
             alleleList.clear();
             lastSiteVisited = ref.getLocus().clone();
+            indelLikelihoodMap.clear();
+            haplotypeMap.clear();

             if (getAlleleListFromVCF) {

@@ -341,7 +353,7 @@ private HaplotypeIndelErrorModel model;
         int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
         // assume only one alt allele for now

-        List<Haplotype> haplotypesInVC;
+        //List<Haplotype> haplotypesInVC;

         int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
         int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
@@ -354,7 +366,7 @@ private HaplotypeIndelErrorModel model;
             System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
                     (int)ref.getWindow().size(), loc.getStart(), numPrefBases);

-        haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
+        haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
                 ref, hsize, numPrefBases);

         // For each sample, get genotype likelihoods based on pileup
@@ -362,9 +374,6 @@ private HaplotypeIndelErrorModel model;
         // initialize the GenotypeLikelihoods
         GLs.clear();

-        double[][] haplotypeLikehoodMatrix;
-
-
         for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
             AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

@@ -375,15 +384,14 @@ private HaplotypeIndelErrorModel model;
             pileup = context.getBasePileup();

             if (pileup != null ) {
+                double[] genotypeLikelihoods;
                 if (useOldWrongHorribleHackedUpLikelihoodModel)
-                    haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
+                    genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
                 else
-                    haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref, HAPLOTYPE_SIZE, eventLength);
-
-                double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
+                    genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, HAPLOTYPE_SIZE, eventLength, indelLikelihoodMap);

                 GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
                         refAllele,
                         altAllele,
@@ -398,4 +406,7 @@ private HaplotypeIndelErrorModel model;

         return refAllele;
     }
+
+
+
 }
@@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;

 import net.sf.samtools.AlignmentBlock;
 import net.sf.samtools.SAMRecord;
+import org.broad.tribble.util.variantcontext.Allele;
 import org.broad.tribble.util.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
 import org.broadinstitute.sting.utils.MathUtils;
@@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.ReadUtils;

 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;

 public class HaplotypeIndelErrorModel {
@@ -419,7 +421,7 @@ public class HaplotypeIndelErrorModel {

     }

-    public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC){
+    public double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, HashMap<Allele,Haplotype> haplotypesInVC){
         double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
         double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
         int i=0;
@@ -429,7 +431,8 @@ public class HaplotypeIndelErrorModel {
         }
         // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
         // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
-        for (int j=0; j < haplotypesInVC.size(); j++) {
+        int j=0;
+        for (Allele a: haplotypesInVC.keySet()) {
             readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read);
             if (DEBUG) {
                 System.out.print(read.getReadName()+" ");
@@ -438,7 +441,7 @@ public class HaplotypeIndelErrorModel {
                         read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
                         read.getCigarString(), readLikelihoods[i][j]);
             }
+            j++;
         }
         i++;
     }
@@ -465,11 +468,11 @@ public class HaplotypeIndelErrorModel {
             }
         }

-        return haplotypeLikehoodMatrix;
+        return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);

     }

-    public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
+    private double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
         int hSize = haplotypeLikehoodMatrix.length;
         double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];

@@ -29,6 +29,7 @@ import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
 import net.sf.samtools.SAMRecord;
+import org.broad.tribble.util.variantcontext.Allele;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate;
 import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager;
@@ -41,6 +42,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap;
 import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.genotype.Haplotype;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
@@ -48,9 +50,7 @@ import org.broadinstitute.sting.utils.text.XReadLines;

 import java.io.File;
 import java.io.FileNotFoundException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;
 import java.util.regex.Pattern;

@@ -697,29 +697,31 @@ public class PairHMMIndelErrorModel {

     }

-    private void fillGapProbabilities(int hIndex, int[] hrunProfile,
-                                      double[][] contextLogGapOpenProbabilities, double[][] contextLogGapContinuationProbabilities) {
+    private void fillGapProbabilities(int[] hrunProfile,
+                                      double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) {
         // fill based on lookup table
         for (int i = 0; i < hrunProfile.length; i++) {
             if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) {
-                contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
-                contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
+                contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
+                contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
             }
             else {
-                contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
-                contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
+                contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
+                contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
             }
         }
     }
-    public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
-                                                                   ReferenceContext ref, int haplotypeSize, int eventLength){
-        double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
-        double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
+    public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
+                                                                 ReferenceContext ref, int haplotypeSize, int eventLength,
+                                                                 HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
+
+        int numHaplotypes = haplotypeMap.size();
+        double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
+        double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes];
         int readIdx=0;

-        double[][] contextLogGapOpenProbabilities = null;
-        double[][] contextLogGapContinuationProbabilities = null;
+        LinkedHashMap<Allele,double[]> gapOpenProbabilityMap = new LinkedHashMap<Allele,double[]>();
+        LinkedHashMap<Allele,double[]> gapContProbabilityMap = new LinkedHashMap<Allele,double[]>();


         if (DEBUG) {
             System.out.println("Reference bases:");
@@ -727,15 +729,15 @@ public class PairHMMIndelErrorModel {
         }

         if (doContextDependentPenalties && !getGapPenaltiesFromFile) {
-            // will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes.
+            // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.

-            for (int j=0; j < haplotypesInVC.size(); j++) {
-                Haplotype haplotype = haplotypesInVC.get(j);
+            for (Allele a: haplotypeMap.keySet()) {
+                Haplotype haplotype = haplotypeMap.get(a);
                 byte[] haplotypeBases = haplotype.getBasesAsBytes();
-                if (contextLogGapOpenProbabilities == null) {
-                    contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
-                    contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
-                }
+                double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
+                double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];

                 // get homopolymer length profile for current haplotype
                 int[] hrunProfile = new int[haplotypeBases.length];
                 getContextHomopolymerLength(haplotypeBases,hrunProfile);
@@ -746,239 +748,261 @@ public class PairHMMIndelErrorModel {
                         System.out.format("%d",hrunProfile[i]);
                     System.out.println();
                 }
-                fillGapProbabilities(j, hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
+                fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);

+                gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
+                gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);

             }
         }
-        for (SAMRecord pread : pileup.getReads()) {
-            GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(pread);
-            if (read == null)
-                continue;
-
-            if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
-                continue;
-            }
-
-            double[] recalQuals = null;
-            if (getGapPenaltiesFromFile) {
-                RecalDataManager.parseSAMRecord( read, RAC );
-
-                recalQuals = new double[read.getReadLength()];
-
-                //compute all covariate values for this read
-                final Comparable[][] covariateValues_offset_x_covar =
-                        RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
-                // For each base in the read
-                for( int offset = 0; offset < read.getReadLength(); offset++ ) {
-
-                    final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
-
-                    Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
-                    if(qualityScore == null)
-                    {
-                        qualityScore = performSequentialQualityCalculation( fullCovariateKey );
-                        qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
-                    }
-
-                    recalQuals[offset] = -((double)qualityScore)/10.0;
-                }
-
-                // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
-                // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
-                if (DEBUG) {
-                    System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
-                            read.getAlignmentStart(),
-                            read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
-                            read.getCigarString());
-
-                    byte[] bases = read.getReadBases();
-                    for (int k = 0; k < recalQuals.length; k++) {
-                        System.out.format("%c",bases[k]);
-                    }
-                    System.out.println();
-
-                    for (int k = 0; k < recalQuals.length; k++) {
-                        System.out.format("%.0f ",recalQuals[k]);
-                    }
-                    System.out.println();
-                }
-            }
-            // get bases of candidate haplotypes that overlap with reads
-            final int trailingBases = 3;
-
-            long readStart = read.getUnclippedStart();
-            long readEnd = read.getUnclippedEnd();
-
-            int numStartSoftClippedBases, numEndSoftClippedBases;
-
-            // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
-            // but they're actually consistent with the insertion!
-            // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
-            // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
-            long eventStartPos = ref.getLocus().getStart();
-
-            // compute total number of clipped bases (soft or hard clipped)
-            numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
-            numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
-
-            // check for hard clips (never consider these bases):
-            /* Cigar c = read.getCigar();
-            CigarElement first = c.getCigarElement(0);
-            CigarElement last = c.getCigarElement(c.numCigarElements()-1);
-            int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
-
-            if (first.getOperator() == CigarOperator.H) {
-                numStartHardClippedBases = first.getLength();
-            }
-
-            if (last.getOperator() == CigarOperator.H) {
-                numEndHardClippedBases = last.getLength();
-            }
-
-            // correct for hard clips
-            numStartSoftClippedBases -= numStartHardClippedBases;
-            numEndSoftClippedBases -= numEndHardClippedBases;
-            readStart += numStartHardClippedBases;
-            readEnd -= numEndHardClippedBases;
-            */
-            // remove soft clips if necessary
-            if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
-                    (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
-                numStartSoftClippedBases = 0;
-                numEndSoftClippedBases = 0;
-            }
-
-            byte[] unclippedReadBases, unclippedReadQuals;
-
-            int numStartClippedBases = numStartSoftClippedBases;
-            int numEndClippedBases = numEndSoftClippedBases;
-            unclippedReadBases = read.getReadBases();
-            unclippedReadQuals = read.getBaseQualities();
-
-            // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
-            // and may leave a string of Q2 bases still hanging off the reads.
-            for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
-                if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
-                    numStartClippedBases++;
-                else
-                    break;
-            }
-            for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
-                if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
-                    numEndClippedBases++;
-                else
-                    break;
-            }
-
-            int extraOffset = Math.abs(eventLength);
-
-            long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
-            long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
-
-            // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
-            int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
-            // check if start of read will be before start of reference context
-            if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
-                start = ref.getWindow().getStart();
-
-            // check also if end of read will go beyond reference context
-            if (stop > ref.getWindow().getStop())
-                stop = ref.getWindow().getStop();
-
-            // if there's an insertion in the read, the read stop position will be less than start + read legnth,
-            // but we want to compute likelihoods in the whole region that a read might overlap
-            if (stop <= start + readLength) {
-                stop = start + readLength-1;
-            }
-
-            // ok, we now figured out total number of clipped bases on both ends.
-            // Figure out where we want to place the haplotype to score read against
-            if (DEBUG)
-                System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
-                        numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
-
-            if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
-                if (DEBUG)
-                    System.out.println("BAD READ!!");
-
-                for (int j=0; j < haplotypesInVC.size(); j++) {
-                    readLikelihoods[readIdx][j]= 0;
+        for (PileupElement p: pileup) {
+            // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
+            if (indelLikelihoodMap.containsKey(p)) {
+                HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
+                int j=0;
+                for (Allele a: haplotypeMap.keySet()) {
+                    readLikelihoods[readIdx][j++] = el.get(a);
                 }
             }
             else {
-                byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
-                byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
-                double[] recalCDP = null;
+                GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
+                if (read == null)
+                    continue;
+
+                if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
+                    continue;
+                }
+
+                double[] recalQuals = null;
+
                 if (getGapPenaltiesFromFile) {
-                    recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
+                    RecalDataManager.parseSAMRecord( read, RAC );
+
+                    recalQuals = new double[read.getReadLength()];
+
+                    //compute all covariate values for this read
+                    final Comparable[][] covariateValues_offset_x_covar =
+                            RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
+                    // For each base in the read
+                    for( int offset = 0; offset < read.getReadLength(); offset++ ) {
+
+                        final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
+
+                        Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
+                        if(qualityScore == null)
+                        {
+                            qualityScore = performSequentialQualityCalculation( fullCovariateKey );
+                            qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
+                        }
+
+                        recalQuals[offset] = -((double)qualityScore)/10.0;
+                    }
+
+                    // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
+                    // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
+                    if (DEBUG) {
+                        System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
+                                read.getAlignmentStart(),
+                                read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
+                                read.getCigarString());
+
+                        byte[] bases = read.getReadBases();
+                        for (int k = 0; k < recalQuals.length; k++) {
+                            System.out.format("%c",bases[k]);
+                        }
+                        System.out.println();
+
+                        for (int k = 0; k < recalQuals.length; k++) {
+                            System.out.format("%.0f ",recalQuals[k]);
+                        }
+                        System.out.println();
+                    }
+                }
+                // get bases of candidate haplotypes that overlap with reads
+                final int trailingBases = 3;
+
+                long readStart = read.getUnclippedStart();
+                long readEnd = read.getUnclippedEnd();
+
+                int numStartSoftClippedBases, numEndSoftClippedBases;
+
+                // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
+                // but they're actually consistent with the insertion!
+                // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
+                // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
+                long eventStartPos = ref.getLocus().getStart();
+
+                // compute total number of clipped bases (soft or hard clipped)
+                numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
+                numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
+
+                // check for hard clips (never consider these bases):
+                /* Cigar c = read.getCigar();
+                CigarElement first = c.getCigarElement(0);
+                CigarElement last = c.getCigarElement(c.numCigarElements()-1);
+                int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
+
+                if (first.getOperator() == CigarOperator.H) {
+                    numStartHardClippedBases = first.getLength();
+                }
+
+                if (last.getOperator() == CigarOperator.H) {
+                    numEndHardClippedBases = last.getLength();
+                }
+
+                // correct for hard clips
+                numStartSoftClippedBases -= numStartHardClippedBases;
+                numEndSoftClippedBases -= numEndHardClippedBases;
+                readStart += numStartHardClippedBases;
+                readEnd -= numEndHardClippedBases;
+                */
+                // remove soft clips if necessary
+                if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
+                        (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
+                    numStartSoftClippedBases = 0;
+                    numEndSoftClippedBases = 0;
+                }
+
+                byte[] unclippedReadBases, unclippedReadQuals;
+
+                int numStartClippedBases = numStartSoftClippedBases;
+                int numEndClippedBases = numEndSoftClippedBases;
+                unclippedReadBases = read.getReadBases();
+                unclippedReadQuals = read.getBaseQualities();
+
+                // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
+                // and may leave a string of Q2 bases still hanging off the reads.
+                for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
+                    if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
+                        numStartClippedBases++;
+                    else
+                        break;
+                }
+                for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
+                    if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
+                        numEndClippedBases++;
+                    else
+                        break;
+                }
+
+                int extraOffset = Math.abs(eventLength);
+
+                long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
+                long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
+
+                // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
+                int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
+                // check if start of read will be before start of reference context
+                if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
+                    start = ref.getWindow().getStart();
+
+                // check also if end of read will go beyond reference context
+                if (stop > ref.getWindow().getStop())
+                    stop = ref.getWindow().getStop();
+
+                // if there's an insertion in the read, the read stop position will be less than start + read legnth,
+                // but we want to compute likelihoods in the whole region that a read might overlap
+                if (stop <= start + readLength) {
+                    stop = start + readLength-1;
+                }
+
+                // ok, we now figured out total number of clipped bases on both ends.
+                // Figure out where we want to place the haplotype to score read against
+                if (DEBUG)
+                    System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
+                            numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
+
+                LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
+
+                if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
+                    if (DEBUG)
+                        System.out.println("BAD READ!!");
+
+                    int j=0;
+                    for (Allele a: haplotypeMap.keySet()) {
+                        readEl.put(a,0.0);
+                        readLikelihoods[readIdx][j++] = 0.0;
+                    }
+
+                }
+                else {
+                    byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
                             unclippedReadBases.length-numEndClippedBases);

-                }
-
-                if (DEBUG) {
-                    System.out.println("Read bases:");
-                    System.out.println(new String(readBases));
-                }
-
-                // start and stop have indices into
-
-                for (int j=0; j < haplotypesInVC.size(); j++) {
-                    Haplotype haplotype = haplotypesInVC.get(j);
-
-                    if (stop > haplotype.getStopPosition())
-                        stop = haplotype.getStopPosition();
-
-                    if (start < haplotype.getStartPosition())
-                        start = haplotype.getStartPosition();
-
-                    // cut haplotype bases
-                    long indStart = start - haplotype.getStartPosition();
-                    long indStop = stop - haplotype.getStartPosition();
-
-                    byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
-                            (int)indStart, (int)indStop);
+                    byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
+                            unclippedReadBases.length-numEndClippedBases);
+
+                    double[] recalCDP = null;
+                    if (getGapPenaltiesFromFile) {
+                        recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
+                                unclippedReadBases.length-numEndClippedBases);
+
+                    }

                     if (DEBUG) {
-                        System.out.println("Haplotype to test:");
-                        System.out.println(new String(haplotypeBases));
+                        System.out.println("Read bases:");
+                        System.out.println(new String(readBases));
                     }

-                    if (useAffineGapModel) {
+                    int j=0;
+                    for (Allele a: haplotypeMap.keySet()) {

-                        double[] currentContextGOP = null;
-                        double[] currentContextGCP = null;

-                        if (doContextDependentPenalties) {
+                        Haplotype haplotype = haplotypeMap.get(a);
+                        if (stop > haplotype.getStopPosition())
+                            stop = haplotype.getStopPosition();

-                            if (getGapPenaltiesFromFile) {
-                                readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
+                        if (start < haplotype.getStartPosition())
+                            start = haplotype.getStartPosition();

-                            } else {
-                                currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop);
-                                currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop);
-                                readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
-                            }
+                        // cut haplotype bases
+                        long indStart = start - haplotype.getStartPosition();
+                        long indStop = stop - haplotype.getStartPosition();
+
+                        byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
+                                (int)indStart, (int)indStop);
+
+                        if (DEBUG) {
+                            System.out.println("Haplotype to test:");
+                            System.out.println(new String(haplotypeBases));
                         }

+                        Double readLikelihood = 0.0;
+                        if (useAffineGapModel) {
+
+                            double[] currentContextGOP = null;
+                            double[] currentContextGCP = null;
+
+                            if (doContextDependentPenalties) {
+
+                                if (getGapPenaltiesFromFile) {
+                                    readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
+
+                                } else {
+                                    currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
+                                    currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
+                                    readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
+                                }
+                            }
+
+                        }
+                        else
+                            readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
+
+                        readEl.put(a,readLikelihood);
+                        readLikelihoods[readIdx][j++] = readLikelihood;
                     }
-                    else
-                        readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);

                 }
+                indelLikelihoodMap.put(p,readEl);
             }

             readIdx++;
         }

@@ -992,8 +1016,8 @@ public class PairHMMIndelErrorModel {
             }

         }
-        for (int i=0; i < haplotypesInVC.size(); i++) {
-            for (int j=i; j < haplotypesInVC.size(); j++){
+        for (int i=0; i < numHaplotypes; i++) {
+            for (int j=i; j < numHaplotypes; j++){
                 // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
                 // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
                 //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
@@ -1002,7 +1026,7 @@ public class PairHMMIndelErrorModel {

                 // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
                 // First term is approximated by Jacobian log with table lookup.
-                if (Double.isInfinite(readLikelihoods[readIdx][i]) || Double.isInfinite(readLikelihoods[readIdx][j]))
+                if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j]))
                     continue;
                 haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
                         readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
@@ -1013,7 +1037,7 @@ public class PairHMMIndelErrorModel {
             }
         }

-        return haplotypeLikehoodMatrix;
+        return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);

     }

@@ -31,9 +31,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;

 public class Haplotype {
     protected byte[] bases = null;
@@ -108,11 +106,11 @@ public class Haplotype {
         return isReference;
     }

-    public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
+    public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
                                                                final int haplotypeSize, final int numPrefBases) {

-        List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
+        LinkedHashMap<Allele,Haplotype> haplotypeMap = new LinkedHashMap<Allele,Haplotype>();

         Allele refAllele = null;

@@ -153,11 +151,11 @@ public class Haplotype {
             String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
             haplotypeString = haplotypeString.substring(0,haplotypeSize);

-            haplotypeList.add(new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
+            haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));

         }

-        return haplotypeList;
+        return haplotypeMap;
     }

 }