First step in refactoring the UG way of storing indel likelihoods - the main motivation is that rank sum annotations require per-read quality or likelihood information, and even the question "which allele of a variant is present in a read", which is trivial for SNPs, may not be that straightforward for indels.

This step just changes the storage of likelihoods: instead of an internal matrix, we now have a class member which stores, as a hash table, a mapping from pileup element to an (allele, likelihood) pair. There is no functional change aside from the internal data storage.
As a bonus, we get a free 2-3x speed improvement in calling because redundant likelihood computations are removed.
The next step will hook this up to the annotation engine and redefine the annotation engine's interaction with UG for the indel case.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5809 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2011-05-15 23:04:11 +00:00
parent 3ccc08ace4
commit 5a7444e186
4 changed files with 297 additions and 261 deletions

View File

@ -34,10 +34,12 @@ import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -57,9 +59,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private boolean DEBUG = false;
private PairHMMIndelErrorModel pairModel;
// gdebug removeme
// todo -cleanup
private HaplotypeIndelErrorModel model;
private HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap;
private LinkedHashMap<Allele,Haplotype> haplotypeMap;
// gdebug removeme
// todo -cleanup
private HaplotypeIndelErrorModel model;
private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
//
private GenomeLoc lastSiteVisited;
@ -84,13 +90,18 @@ private HaplotypeIndelErrorModel model;
model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY,
INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
}
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
alleleList = new ArrayList<Allele>();
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
indelLikelihoodMap = new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
}
@ -215,7 +226,7 @@ private HaplotypeIndelErrorModel model;
if (DEBUG) {
int icount = indelPileup.getNumberOfInsertions();
int dcount = indelPileup.getNumberOfDeletions();
int dcount = indelPileup.getNumberOfDeletions();
if (icount + dcount > 0)
{
List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
@ -294,7 +305,8 @@ private HaplotypeIndelErrorModel model;
// starting a new site: clear allele list
alleleList.clear();
lastSiteVisited = ref.getLocus().clone();
indelLikelihoodMap.clear();
haplotypeMap.clear();
if (getAlleleListFromVCF) {
@ -341,7 +353,7 @@ private HaplotypeIndelErrorModel model;
int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
// assume only one alt allele for now
List<Haplotype> haplotypesInVC;
//List<Haplotype> haplotypesInVC;
int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
@ -354,7 +366,7 @@ private HaplotypeIndelErrorModel model;
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
ref, hsize, numPrefBases);
// For each sample, get genotype likelihoods based on pileup
@ -362,9 +374,6 @@ private HaplotypeIndelErrorModel model;
// initialize the GenotypeLikelihoods
GLs.clear();
double[][] haplotypeLikehoodMatrix;
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
@ -375,15 +384,14 @@ private HaplotypeIndelErrorModel model;
pileup = context.getBasePileup();
if (pileup != null ) {
double[] genotypeLikelihoods;
if (useOldWrongHorribleHackedUpLikelihoodModel)
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else
haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref, HAPLOTYPE_SIZE, eventLength);
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, HAPLOTYPE_SIZE, eventLength, indelLikelihoodMap);
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
refAllele,
altAllele,
@ -398,4 +406,7 @@ private HaplotypeIndelErrorModel model;
return refAllele;
}
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.AlignmentBlock;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.utils.MathUtils;
@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
public class HaplotypeIndelErrorModel {
@ -419,7 +421,7 @@ public class HaplotypeIndelErrorModel {
}
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC){
public double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, HashMap<Allele,Haplotype> haplotypesInVC){
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
int i=0;
@ -429,7 +431,8 @@ public class HaplotypeIndelErrorModel {
}
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
for (int j=0; j < haplotypesInVC.size(); j++) {
int j=0;
for (Allele a: haplotypesInVC.keySet()) {
readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read);
if (DEBUG) {
System.out.print(read.getReadName()+" ");
@ -438,7 +441,7 @@ public class HaplotypeIndelErrorModel {
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
read.getCigarString(), readLikelihoods[i][j]);
}
j++;
}
i++;
}
@ -465,11 +468,11 @@ public class HaplotypeIndelErrorModel {
}
}
return haplotypeLikehoodMatrix;
return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
}
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
private double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
int hSize = haplotypeLikehoodMatrix.length;
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];

View File

@ -29,6 +29,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.util.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate;
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager;
@ -41,6 +42,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -48,9 +50,7 @@ import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
import java.util.regex.Pattern;
@ -697,29 +697,31 @@ public class PairHMMIndelErrorModel {
}
private void fillGapProbabilities(int hIndex, int[] hrunProfile,
double[][] contextLogGapOpenProbabilities, double[][] contextLogGapContinuationProbabilities) {
private void fillGapProbabilities(int[] hrunProfile,
double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) {
// fill based on lookup table
for (int i = 0; i < hrunProfile.length; i++) {
if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) {
contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
}
else {
contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
}
}
}
public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
ReferenceContext ref, int haplotypeSize, int eventLength){
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
ReferenceContext ref, int haplotypeSize, int eventLength,
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
int numHaplotypes = haplotypeMap.size();
double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes];
int readIdx=0;
double[][] contextLogGapOpenProbabilities = null;
double[][] contextLogGapContinuationProbabilities = null;
LinkedHashMap<Allele,double[]> gapOpenProbabilityMap = new LinkedHashMap<Allele,double[]>();
LinkedHashMap<Allele,double[]> gapContProbabilityMap = new LinkedHashMap<Allele,double[]>();
if (DEBUG) {
System.out.println("Reference bases:");
@ -727,15 +729,15 @@ public class PairHMMIndelErrorModel {
}
if (doContextDependentPenalties && !getGapPenaltiesFromFile) {
// will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes.
// will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.
for (int j=0; j < haplotypesInVC.size(); j++) {
Haplotype haplotype = haplotypesInVC.get(j);
for (Allele a: haplotypeMap.keySet()) {
Haplotype haplotype = haplotypeMap.get(a);
byte[] haplotypeBases = haplotype.getBasesAsBytes();
if (contextLogGapOpenProbabilities == null) {
contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
}
double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[haplotypeBases.length];
getContextHomopolymerLength(haplotypeBases,hrunProfile);
@ -746,239 +748,261 @@ public class PairHMMIndelErrorModel {
System.out.format("%d",hrunProfile[i]);
System.out.println();
}
fillGapProbabilities(j, hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);
}
}
for (SAMRecord pread : pileup.getReads()) {
GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(pread);
if (read == null)
continue;
for (PileupElement p: pileup) {
if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
continue;
}
double[] recalQuals = null;
if (getGapPenaltiesFromFile) {
RecalDataManager.parseSAMRecord( read, RAC );
recalQuals = new double[read.getReadLength()];
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null)
{
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = -((double)qualityScore)/10.0;
}
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
if (DEBUG) {
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
read.getAlignmentStart(),
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
read.getCigarString());
byte[] bases = read.getReadBases();
for (int k = 0; k < recalQuals.length; k++) {
System.out.format("%c",bases[k]);
}
System.out.println();
for (int k = 0; k < recalQuals.length; k++) {
System.out.format("%.0f ",recalQuals[k]);
}
System.out.println();
}
}
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
long readStart = read.getUnclippedStart();
long readEnd = read.getUnclippedEnd();
int numStartSoftClippedBases, numEndSoftClippedBases;
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
// but they're actually consistent with the insertion!
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped)
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
// check for hard clips (never consider these bases):
/* Cigar c = read.getCigar();
CigarElement first = c.getCigarElement(0);
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartHardClippedBases = first.getLength();
}
if (last.getOperator() == CigarOperator.H) {
numEndHardClippedBases = last.getLength();
}
// correct for hard clips
numStartSoftClippedBases -= numStartHardClippedBases;
numEndSoftClippedBases -= numEndHardClippedBases;
readStart += numStartHardClippedBases;
readEnd -= numEndHardClippedBases;
*/
// remove soft clips if necessary
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
numStartSoftClippedBases = 0;
numEndSoftClippedBases = 0;
}
byte[] unclippedReadBases, unclippedReadQuals;
int numStartClippedBases = numStartSoftClippedBases;
int numEndClippedBases = numEndSoftClippedBases;
unclippedReadBases = read.getReadBases();
unclippedReadQuals = read.getBaseQualities();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numStartClippedBases++;
else
break;
}
for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numEndClippedBases++;
else
break;
}
int extraOffset = Math.abs(eventLength);
long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
// Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
// check if start of read will be before start of reference context
if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
start = ref.getWindow().getStart();
// check also if end of read will go beyond reference context
if (stop > ref.getWindow().getStop())
stop = ref.getWindow().getStop();
// if there's an insertion in the read, the read stop position will be less than start + read legnth,
// but we want to compute likelihoods in the whole region that a read might overlap
if (stop <= start + readLength) {
stop = start + readLength-1;
}
// ok, we now figured out total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
if (DEBUG)
System.out.println("BAD READ!!");
for (int j=0; j < haplotypesInVC.size(); j++) {
readLikelihoods[readIdx][j]= 0;
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
if (indelLikelihoodMap.containsKey(p)) {
HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readLikelihoods[readIdx][j++] = el.get(a);
}
}
else {
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
continue;
}
double[] recalQuals = null;
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
double[] recalCDP = null;
if (getGapPenaltiesFromFile) {
recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
RecalDataManager.parseSAMRecord( read, RAC );
recalQuals = new double[read.getReadLength()];
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null)
{
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = -((double)qualityScore)/10.0;
}
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
if (DEBUG) {
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
read.getAlignmentStart(),
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
read.getCigarString());
byte[] bases = read.getReadBases();
for (int k = 0; k < recalQuals.length; k++) {
System.out.format("%c",bases[k]);
}
System.out.println();
for (int k = 0; k < recalQuals.length; k++) {
System.out.format("%.0f ",recalQuals[k]);
}
System.out.println();
}
}
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
long readStart = read.getUnclippedStart();
long readEnd = read.getUnclippedEnd();
int numStartSoftClippedBases, numEndSoftClippedBases;
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
// but they're actually consistent with the insertion!
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped)
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
// check for hard clips (never consider these bases):
/* Cigar c = read.getCigar();
CigarElement first = c.getCigarElement(0);
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartHardClippedBases = first.getLength();
}
if (last.getOperator() == CigarOperator.H) {
numEndHardClippedBases = last.getLength();
}
// correct for hard clips
numStartSoftClippedBases -= numStartHardClippedBases;
numEndSoftClippedBases -= numEndHardClippedBases;
readStart += numStartHardClippedBases;
readEnd -= numEndHardClippedBases;
*/
// remove soft clips if necessary
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
numStartSoftClippedBases = 0;
numEndSoftClippedBases = 0;
}
byte[] unclippedReadBases, unclippedReadQuals;
int numStartClippedBases = numStartSoftClippedBases;
int numEndClippedBases = numEndSoftClippedBases;
unclippedReadBases = read.getReadBases();
unclippedReadQuals = read.getBaseQualities();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numStartClippedBases++;
else
break;
}
for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numEndClippedBases++;
else
break;
}
int extraOffset = Math.abs(eventLength);
long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
// Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
// check if start of read will be before start of reference context
if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
start = ref.getWindow().getStart();
// check also if end of read will go beyond reference context
if (stop > ref.getWindow().getStop())
stop = ref.getWindow().getStop();
// if there's an insertion in the read, the read stop position will be less than start + read legnth,
// but we want to compute likelihoods in the whole region that a read might overlap
if (stop <= start + readLength) {
stop = start + readLength-1;
}
// ok, we now figured out total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
if (DEBUG)
System.out.println("BAD READ!!");
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readEl.put(a,0.0);
readLikelihoods[readIdx][j++] = 0.0;
}
}
else {
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
}
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
if (DEBUG) {
System.out.println("Read bases:");
System.out.println(new String(readBases));
}
double[] recalCDP = null;
if (getGapPenaltiesFromFile) {
recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
// start and stop have indices into
for (int j=0; j < haplotypesInVC.size(); j++) {
Haplotype haplotype = haplotypesInVC.get(j);
if (stop > haplotype.getStopPosition())
stop = haplotype.getStopPosition();
if (start < haplotype.getStartPosition())
start = haplotype.getStartPosition();
// cut haplotype bases
long indStart = start - haplotype.getStartPosition();
long indStop = stop - haplotype.getStartPosition();
byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
(int)indStart, (int)indStop);
}
if (DEBUG) {
System.out.println("Haplotype to test:");
System.out.println(new String(haplotypeBases));
System.out.println("Read bases:");
System.out.println(new String(readBases));
}
if (useAffineGapModel) {
int j=0;
for (Allele a: haplotypeMap.keySet()) {
double[] currentContextGOP = null;
double[] currentContextGCP = null;
if (doContextDependentPenalties) {
Haplotype haplotype = haplotypeMap.get(a);
if (stop > haplotype.getStopPosition())
stop = haplotype.getStopPosition();
if (getGapPenaltiesFromFile) {
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
if (start < haplotype.getStartPosition())
start = haplotype.getStartPosition();
} else {
currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop);
currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop);
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
}
// cut haplotype bases
long indStart = start - haplotype.getStartPosition();
long indStop = stop - haplotype.getStartPosition();
byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
(int)indStart, (int)indStop);
if (DEBUG) {
System.out.println("Haplotype to test:");
System.out.println(new String(haplotypeBases));
}
Double readLikelihood = 0.0;
if (useAffineGapModel) {
double[] currentContextGOP = null;
double[] currentContextGCP = null;
if (doContextDependentPenalties) {
if (getGapPenaltiesFromFile) {
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
} else {
currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
}
}
}
else
readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
readEl.put(a,readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
else
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
}
indelLikelihoodMap.put(p,readEl);
}
readIdx++;
}
@ -992,8 +1016,8 @@ public class PairHMMIndelErrorModel {
}
}
for (int i=0; i < haplotypesInVC.size(); i++) {
for (int j=i; j < haplotypesInVC.size(); j++){
for (int i=0; i < numHaplotypes; i++) {
for (int j=i; j < numHaplotypes; j++){
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
@ -1002,7 +1026,7 @@ public class PairHMMIndelErrorModel {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
if (Double.isInfinite(readLikelihoods[readIdx][i]) || Double.isInfinite(readLikelihoods[readIdx][j]))
if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j]))
continue;
haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
@ -1013,7 +1037,7 @@ public class PairHMMIndelErrorModel {
}
}
return haplotypeLikehoodMatrix;
return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
}

View File

@ -31,9 +31,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
public class Haplotype {
protected byte[] bases = null;
@ -108,11 +106,11 @@ public class Haplotype {
return isReference;
}
public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
final int haplotypeSize, final int numPrefBases) {
List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
LinkedHashMap<Allele,Haplotype> haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
Allele refAllele = null;
@ -153,11 +151,11 @@ public class Haplotype {
String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
haplotypeString = haplotypeString.substring(0,haplotypeSize);
haplotypeList.add(new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
}
return haplotypeList;
return haplotypeMap;
}
}