First step in refactoring UG way of storing indel likelihoods - main motive is that rank sum annotations require per-read quality or likelihood information, and even the question "what allele of a variant is present in a read" which is trivial for SNPs may not be that straightforward for indels.
This step just changes storage of likelihoods so now we have, instead of an internal matrix, a class member which stores, as a hash table, a mapping from pileup element to an (allele, likelihood) pair. There's no functional change aside from internal data storage. As a bonus, we get for free a 2-3x improvement in speed in calling because redundant likelihood computations are removed. Next step will hook this up to, and redefine annotation engine interaction with UG for indel case. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5809 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3ccc08ace4
commit
5a7444e186
|
|
@ -34,10 +34,12 @@ import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
|||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -57,9 +59,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
private boolean DEBUG = false;
|
||||
|
||||
private PairHMMIndelErrorModel pairModel;
|
||||
// gdebug removeme
|
||||
// todo -cleanup
|
||||
private HaplotypeIndelErrorModel model;
|
||||
|
||||
private HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap;
|
||||
private LinkedHashMap<Allele,Haplotype> haplotypeMap;
|
||||
|
||||
// gdebug removeme
|
||||
// todo -cleanup
|
||||
private HaplotypeIndelErrorModel model;
|
||||
private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
|
||||
//
|
||||
private GenomeLoc lastSiteVisited;
|
||||
|
|
@ -84,13 +90,18 @@ private HaplotypeIndelErrorModel model;
|
|||
model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY,
|
||||
INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
|
||||
}
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
|
||||
alleleList = new ArrayList<Allele>();
|
||||
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
|
||||
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
|
||||
|
||||
indelLikelihoodMap = new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
|
||||
haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -215,7 +226,7 @@ private HaplotypeIndelErrorModel model;
|
|||
|
||||
if (DEBUG) {
|
||||
int icount = indelPileup.getNumberOfInsertions();
|
||||
int dcount = indelPileup.getNumberOfDeletions();
|
||||
int dcount = indelPileup.getNumberOfDeletions();
|
||||
if (icount + dcount > 0)
|
||||
{
|
||||
List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
|
||||
|
|
@ -294,7 +305,8 @@ private HaplotypeIndelErrorModel model;
|
|||
// starting a new site: clear allele list
|
||||
alleleList.clear();
|
||||
lastSiteVisited = ref.getLocus().clone();
|
||||
|
||||
indelLikelihoodMap.clear();
|
||||
haplotypeMap.clear();
|
||||
|
||||
if (getAlleleListFromVCF) {
|
||||
|
||||
|
|
@ -341,7 +353,7 @@ private HaplotypeIndelErrorModel model;
|
|||
int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
|
||||
// assume only one alt allele for now
|
||||
|
||||
List<Haplotype> haplotypesInVC;
|
||||
//List<Haplotype> haplotypesInVC;
|
||||
|
||||
int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
|
||||
int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
|
||||
|
|
@ -354,7 +366,7 @@ private HaplotypeIndelErrorModel model;
|
|||
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
|
||||
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
|
||||
|
||||
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||
ref, hsize, numPrefBases);
|
||||
|
||||
// For each sample, get genotype likelihoods based on pileup
|
||||
|
|
@ -362,9 +374,6 @@ private HaplotypeIndelErrorModel model;
|
|||
// initialize the GenotypeLikelihoods
|
||||
GLs.clear();
|
||||
|
||||
double[][] haplotypeLikehoodMatrix;
|
||||
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
|
||||
|
||||
|
|
@ -375,15 +384,14 @@ private HaplotypeIndelErrorModel model;
|
|||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null ) {
|
||||
double[] genotypeLikelihoods;
|
||||
if (useOldWrongHorribleHackedUpLikelihoodModel)
|
||||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
|
||||
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
|
||||
else
|
||||
haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref, HAPLOTYPE_SIZE, eventLength);
|
||||
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, HAPLOTYPE_SIZE, eventLength, indelLikelihoodMap);
|
||||
|
||||
|
||||
|
||||
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
||||
|
||||
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
refAllele,
|
||||
altAllele,
|
||||
|
|
@ -398,4 +406,7 @@ private HaplotypeIndelErrorModel model;
|
|||
|
||||
return refAllele;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
|
|||
|
||||
import net.sf.samtools.AlignmentBlock;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
public class HaplotypeIndelErrorModel {
|
||||
|
|
@ -419,7 +421,7 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC){
|
||||
public double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, HashMap<Allele,Haplotype> haplotypesInVC){
|
||||
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
|
||||
int i=0;
|
||||
|
|
@ -429,7 +431,8 @@ public class HaplotypeIndelErrorModel {
|
|||
}
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
int j=0;
|
||||
for (Allele a: haplotypesInVC.keySet()) {
|
||||
readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read);
|
||||
if (DEBUG) {
|
||||
System.out.print(read.getReadName()+" ");
|
||||
|
|
@ -438,7 +441,7 @@ public class HaplotypeIndelErrorModel {
|
|||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString(), readLikelihoods[i][j]);
|
||||
}
|
||||
|
||||
j++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
|
@ -465,11 +468,11 @@ public class HaplotypeIndelErrorModel {
|
|||
}
|
||||
}
|
||||
|
||||
return haplotypeLikehoodMatrix;
|
||||
return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
|
||||
|
||||
}
|
||||
|
||||
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||
private double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||
int hSize = haplotypeLikehoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
|
||||
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager;
|
||||
|
|
@ -41,6 +42,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
|||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -48,9 +50,7 @@ import org.broadinstitute.sting.utils.text.XReadLines;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
|
|
@ -697,29 +697,31 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
private void fillGapProbabilities(int hIndex, int[] hrunProfile,
|
||||
double[][] contextLogGapOpenProbabilities, double[][] contextLogGapContinuationProbabilities) {
|
||||
private void fillGapProbabilities(int[] hrunProfile,
|
||||
double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) {
|
||||
// fill based on lookup table
|
||||
for (int i = 0; i < hrunProfile.length; i++) {
|
||||
if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) {
|
||||
contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
|
||||
contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
|
||||
contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
|
||||
contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
|
||||
}
|
||||
else {
|
||||
contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
|
||||
contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
|
||||
contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
|
||||
contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
|
||||
}
|
||||
}
|
||||
}
|
||||
public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
|
||||
ReferenceContext ref, int haplotypeSize, int eventLength){
|
||||
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
|
||||
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
|
||||
ReferenceContext ref, int haplotypeSize, int eventLength,
|
||||
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
|
||||
|
||||
int numHaplotypes = haplotypeMap.size();
|
||||
double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes];
|
||||
int readIdx=0;
|
||||
|
||||
double[][] contextLogGapOpenProbabilities = null;
|
||||
double[][] contextLogGapContinuationProbabilities = null;
|
||||
|
||||
LinkedHashMap<Allele,double[]> gapOpenProbabilityMap = new LinkedHashMap<Allele,double[]>();
|
||||
LinkedHashMap<Allele,double[]> gapContProbabilityMap = new LinkedHashMap<Allele,double[]>();
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Reference bases:");
|
||||
|
|
@ -727,15 +729,15 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
|
||||
if (doContextDependentPenalties && !getGapPenaltiesFromFile) {
|
||||
// will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes.
|
||||
// will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.
|
||||
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
Haplotype haplotype = haplotypesInVC.get(j);
|
||||
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
Haplotype haplotype = haplotypeMap.get(a);
|
||||
byte[] haplotypeBases = haplotype.getBasesAsBytes();
|
||||
if (contextLogGapOpenProbabilities == null) {
|
||||
contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
|
||||
contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
|
||||
}
|
||||
double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
|
||||
double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];
|
||||
|
||||
// get homopolymer length profile for current haplotype
|
||||
int[] hrunProfile = new int[haplotypeBases.length];
|
||||
getContextHomopolymerLength(haplotypeBases,hrunProfile);
|
||||
|
|
@ -746,239 +748,261 @@ public class PairHMMIndelErrorModel {
|
|||
System.out.format("%d",hrunProfile[i]);
|
||||
System.out.println();
|
||||
}
|
||||
fillGapProbabilities(j, hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
|
||||
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
|
||||
|
||||
gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
|
||||
gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);
|
||||
|
||||
}
|
||||
}
|
||||
for (SAMRecord pread : pileup.getReads()) {
|
||||
GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(pread);
|
||||
if (read == null)
|
||||
continue;
|
||||
for (PileupElement p: pileup) {
|
||||
|
||||
if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
|
||||
continue;
|
||||
}
|
||||
|
||||
double[] recalQuals = null;
|
||||
if (getGapPenaltiesFromFile) {
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
|
||||
|
||||
recalQuals = new double[read.getReadLength()];
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = -((double)qualityScore)/10.0;
|
||||
}
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
if (DEBUG) {
|
||||
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
|
||||
read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString());
|
||||
|
||||
byte[] bases = read.getReadBases();
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%c",bases[k]);
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%.0f ",recalQuals[k]);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
// get bases of candidate haplotypes that overlap with reads
|
||||
final int trailingBases = 3;
|
||||
|
||||
long readStart = read.getUnclippedStart();
|
||||
long readEnd = read.getUnclippedEnd();
|
||||
|
||||
int numStartSoftClippedBases, numEndSoftClippedBases;
|
||||
|
||||
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
|
||||
// but they're actually consistent with the insertion!
|
||||
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
|
||||
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
|
||||
long eventStartPos = ref.getLocus().getStart();
|
||||
|
||||
// compute total number of clipped bases (soft or hard clipped)
|
||||
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
|
||||
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
|
||||
|
||||
// check for hard clips (never consider these bases):
|
||||
/* Cigar c = read.getCigar();
|
||||
CigarElement first = c.getCigarElement(0);
|
||||
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
|
||||
int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
|
||||
|
||||
if (first.getOperator() == CigarOperator.H) {
|
||||
numStartHardClippedBases = first.getLength();
|
||||
}
|
||||
|
||||
if (last.getOperator() == CigarOperator.H) {
|
||||
numEndHardClippedBases = last.getLength();
|
||||
}
|
||||
|
||||
// correct for hard clips
|
||||
numStartSoftClippedBases -= numStartHardClippedBases;
|
||||
numEndSoftClippedBases -= numEndHardClippedBases;
|
||||
readStart += numStartHardClippedBases;
|
||||
readEnd -= numEndHardClippedBases;
|
||||
*/
|
||||
// remove soft clips if necessary
|
||||
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
|
||||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
|
||||
numStartSoftClippedBases = 0;
|
||||
numEndSoftClippedBases = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
byte[] unclippedReadBases, unclippedReadQuals;
|
||||
|
||||
int numStartClippedBases = numStartSoftClippedBases;
|
||||
int numEndClippedBases = numEndSoftClippedBases;
|
||||
unclippedReadBases = read.getReadBases();
|
||||
unclippedReadQuals = read.getBaseQualities();
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numStartClippedBases++;
|
||||
else
|
||||
break;
|
||||
|
||||
}
|
||||
for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numEndClippedBases++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
int extraOffset = Math.abs(eventLength);
|
||||
|
||||
long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
|
||||
long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
|
||||
|
||||
// Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
|
||||
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
|
||||
// check if start of read will be before start of reference context
|
||||
if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
|
||||
start = ref.getWindow().getStart();
|
||||
|
||||
// check also if end of read will go beyond reference context
|
||||
if (stop > ref.getWindow().getStop())
|
||||
stop = ref.getWindow().getStop();
|
||||
|
||||
// if there's an insertion in the read, the read stop position will be less than start + read legnth,
|
||||
// but we want to compute likelihoods in the whole region that a read might overlap
|
||||
if (stop <= start + readLength) {
|
||||
stop = start + readLength-1;
|
||||
}
|
||||
|
||||
// ok, we now figured out total number of clipped bases on both ends.
|
||||
// Figure out where we want to place the haplotype to score read against
|
||||
if (DEBUG)
|
||||
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
|
||||
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
|
||||
|
||||
|
||||
|
||||
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
|
||||
if (DEBUG)
|
||||
System.out.println("BAD READ!!");
|
||||
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
readLikelihoods[readIdx][j]= 0;
|
||||
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
|
||||
if (indelLikelihoodMap.containsKey(p)) {
|
||||
HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
|
||||
int j=0;
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
readLikelihoods[readIdx][j++] = el.get(a);
|
||||
}
|
||||
}
|
||||
else {
|
||||
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
|
||||
if (read == null)
|
||||
continue;
|
||||
|
||||
if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
|
||||
continue;
|
||||
}
|
||||
|
||||
double[] recalQuals = null;
|
||||
|
||||
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
double[] recalCDP = null;
|
||||
if (getGapPenaltiesFromFile) {
|
||||
recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
|
||||
|
||||
recalQuals = new double[read.getReadLength()];
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = -((double)qualityScore)/10.0;
|
||||
}
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
if (DEBUG) {
|
||||
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
|
||||
read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString());
|
||||
|
||||
byte[] bases = read.getReadBases();
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%c",bases[k]);
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%.0f ",recalQuals[k]);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
// get bases of candidate haplotypes that overlap with reads
|
||||
final int trailingBases = 3;
|
||||
|
||||
long readStart = read.getUnclippedStart();
|
||||
long readEnd = read.getUnclippedEnd();
|
||||
|
||||
int numStartSoftClippedBases, numEndSoftClippedBases;
|
||||
|
||||
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
|
||||
// but they're actually consistent with the insertion!
|
||||
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
|
||||
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
|
||||
long eventStartPos = ref.getLocus().getStart();
|
||||
|
||||
// compute total number of clipped bases (soft or hard clipped)
|
||||
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
|
||||
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
|
||||
|
||||
// check for hard clips (never consider these bases):
|
||||
/* Cigar c = read.getCigar();
|
||||
CigarElement first = c.getCigarElement(0);
|
||||
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
|
||||
int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
|
||||
|
||||
if (first.getOperator() == CigarOperator.H) {
|
||||
numStartHardClippedBases = first.getLength();
|
||||
}
|
||||
|
||||
if (last.getOperator() == CigarOperator.H) {
|
||||
numEndHardClippedBases = last.getLength();
|
||||
}
|
||||
|
||||
// correct for hard clips
|
||||
numStartSoftClippedBases -= numStartHardClippedBases;
|
||||
numEndSoftClippedBases -= numEndHardClippedBases;
|
||||
readStart += numStartHardClippedBases;
|
||||
readEnd -= numEndHardClippedBases;
|
||||
*/
|
||||
// remove soft clips if necessary
|
||||
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
|
||||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
|
||||
numStartSoftClippedBases = 0;
|
||||
numEndSoftClippedBases = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
byte[] unclippedReadBases, unclippedReadQuals;
|
||||
|
||||
int numStartClippedBases = numStartSoftClippedBases;
|
||||
int numEndClippedBases = numEndSoftClippedBases;
|
||||
unclippedReadBases = read.getReadBases();
|
||||
unclippedReadQuals = read.getBaseQualities();
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numStartClippedBases++;
|
||||
else
|
||||
break;
|
||||
|
||||
}
|
||||
for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numEndClippedBases++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
int extraOffset = Math.abs(eventLength);
|
||||
|
||||
long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
|
||||
long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
|
||||
|
||||
// Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
|
||||
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
|
||||
// check if start of read will be before start of reference context
|
||||
if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
|
||||
start = ref.getWindow().getStart();
|
||||
|
||||
// check also if end of read will go beyond reference context
|
||||
if (stop > ref.getWindow().getStop())
|
||||
stop = ref.getWindow().getStop();
|
||||
|
||||
// if there's an insertion in the read, the read stop position will be less than start + read legnth,
|
||||
// but we want to compute likelihoods in the whole region that a read might overlap
|
||||
if (stop <= start + readLength) {
|
||||
stop = start + readLength-1;
|
||||
}
|
||||
|
||||
// ok, we now figured out total number of clipped bases on both ends.
|
||||
// Figure out where we want to place the haplotype to score read against
|
||||
if (DEBUG)
|
||||
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
|
||||
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
|
||||
|
||||
|
||||
|
||||
LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
|
||||
|
||||
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
|
||||
if (DEBUG)
|
||||
System.out.println("BAD READ!!");
|
||||
|
||||
int j=0;
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
readEl.put(a,0.0);
|
||||
readLikelihoods[readIdx][j++] = 0.0;
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
}
|
||||
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Read bases:");
|
||||
System.out.println(new String(readBases));
|
||||
}
|
||||
double[] recalCDP = null;
|
||||
if (getGapPenaltiesFromFile) {
|
||||
recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
|
||||
// start and stop have indices into
|
||||
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
Haplotype haplotype = haplotypesInVC.get(j);
|
||||
|
||||
if (stop > haplotype.getStopPosition())
|
||||
stop = haplotype.getStopPosition();
|
||||
|
||||
if (start < haplotype.getStartPosition())
|
||||
start = haplotype.getStartPosition();
|
||||
|
||||
// cut haplotype bases
|
||||
long indStart = start - haplotype.getStartPosition();
|
||||
long indStop = stop - haplotype.getStartPosition();
|
||||
|
||||
byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
|
||||
(int)indStart, (int)indStop);
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Haplotype to test:");
|
||||
System.out.println(new String(haplotypeBases));
|
||||
System.out.println("Read bases:");
|
||||
System.out.println(new String(readBases));
|
||||
}
|
||||
|
||||
if (useAffineGapModel) {
|
||||
int j=0;
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
|
||||
double[] currentContextGOP = null;
|
||||
double[] currentContextGCP = null;
|
||||
|
||||
if (doContextDependentPenalties) {
|
||||
Haplotype haplotype = haplotypeMap.get(a);
|
||||
if (stop > haplotype.getStopPosition())
|
||||
stop = haplotype.getStopPosition();
|
||||
|
||||
if (getGapPenaltiesFromFile) {
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
|
||||
if (start < haplotype.getStartPosition())
|
||||
start = haplotype.getStartPosition();
|
||||
|
||||
} else {
|
||||
currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop);
|
||||
currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop);
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
|
||||
}
|
||||
// cut haplotype bases
|
||||
long indStart = start - haplotype.getStartPosition();
|
||||
long indStop = stop - haplotype.getStartPosition();
|
||||
|
||||
byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
|
||||
(int)indStart, (int)indStop);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Haplotype to test:");
|
||||
System.out.println(new String(haplotypeBases));
|
||||
}
|
||||
|
||||
|
||||
Double readLikelihood = 0.0;
|
||||
if (useAffineGapModel) {
|
||||
|
||||
double[] currentContextGOP = null;
|
||||
double[] currentContextGCP = null;
|
||||
|
||||
if (doContextDependentPenalties) {
|
||||
|
||||
if (getGapPenaltiesFromFile) {
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
|
||||
|
||||
} else {
|
||||
currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
|
||||
|
||||
readEl.put(a,readLikelihood);
|
||||
readLikelihoods[readIdx][j++] = readLikelihood;
|
||||
}
|
||||
else
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
|
||||
|
||||
|
||||
}
|
||||
indelLikelihoodMap.put(p,readEl);
|
||||
}
|
||||
|
||||
readIdx++;
|
||||
}
|
||||
|
||||
|
|
@ -992,8 +1016,8 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
|
||||
}
|
||||
for (int i=0; i < haplotypesInVC.size(); i++) {
|
||||
for (int j=i; j < haplotypesInVC.size(); j++){
|
||||
for (int i=0; i < numHaplotypes; i++) {
|
||||
for (int j=i; j < numHaplotypes; j++){
|
||||
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
|
||||
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
|
||||
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
|
||||
|
|
@ -1002,7 +1026,7 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
if (Double.isInfinite(readLikelihoods[readIdx][i]) || Double.isInfinite(readLikelihoods[readIdx][j]))
|
||||
if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j]))
|
||||
continue;
|
||||
haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
|
||||
readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
|
||||
|
|
@ -1013,7 +1037,7 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
|
||||
return haplotypeLikehoodMatrix;
|
||||
return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -31,9 +31,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class Haplotype {
|
||||
protected byte[] bases = null;
|
||||
|
|
@ -108,11 +106,11 @@ public class Haplotype {
|
|||
return isReference;
|
||||
}
|
||||
|
||||
public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
|
||||
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
|
||||
final int haplotypeSize, final int numPrefBases) {
|
||||
|
||||
|
||||
List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
|
||||
LinkedHashMap<Allele,Haplotype> haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
|
||||
|
||||
Allele refAllele = null;
|
||||
|
||||
|
|
@ -153,11 +151,11 @@ public class Haplotype {
|
|||
String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
|
||||
haplotypeString = haplotypeString.substring(0,haplotypeSize);
|
||||
|
||||
haplotypeList.add(new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
|
||||
haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
|
||||
|
||||
}
|
||||
|
||||
return haplotypeList;
|
||||
return haplotypeMap;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue