From 8a7f5aba4b8a97940549dd20da32f344aed237a0 Mon Sep 17 00:00:00 2001 From: delangel Date: Fri, 3 Sep 2010 00:25:34 +0000 Subject: [PATCH] First more or less sort of functional framework for statistical Indel error caller. Current implementation computes Pr(read|haplotype) based on Dindel's error model. A simple walker that takes an existing vcf, generates haplotypes around calls and computes genotype likelihoods is used to test this as first example. No attempt yet to use prior information on indel AF, nor to use multi-sample caller abilities. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4197 348d0f76-0448-11de-a6fe-93d51630548a --- .../indels/HaplotypeIndelErrorModel.java | 410 ++++++++++++++++++ .../walkers/SimpleIndelGenotyperWalker.java | 259 +++++++++++ .../sting/utils/genotype/Haplotype.java | 71 +++ 3 files changed, 740 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java new file mode 100755 index 000000000..ccdf30f06 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.AlignmentBlock; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.Haplotype; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Aug 24, 2010 + * Time: 1:31:08 PM + * To change this template use File | Settings | File Templates. + */ +public class HaplotypeIndelErrorModel { + + private final int maxReadDeletionLength; // maximum length of deletion on a read + private final double noDeletionProbability; // alpha for geometric distribution for deletion length + private final int haplotypeSize; + private final int maxReadLength; + private final int PATH_METRIC_TABLE_LENGTH; + private final int RIGHT_ALIGN_INDEX; + private final int LEFT_ALIGN_INDEX; + + private static final double INFINITE = 10000000000000.0; + + private double deletionErrorProbabilities[]; + + private double pathMetricArray[]; + private int bestStateIndexArray[][]; + + private final double logOneMinusInsertionStartProbability; + private final double logInsertionStartProbability; + private final double logInsertionEndProbability; + private final double logOneMinusInsertionEndProbability; + + public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength) { + this.maxReadDeletionLength = mrdl; + this.noDeletionProbability = 1-alpha; + this.haplotypeSize = haplotypeSize; + this.maxReadLength = maxReadLength; + + PATH_METRIC_TABLE_LENGTH = haplotypeSize+2; + RIGHT_ALIGN_INDEX = PATH_METRIC_TABLE_LENGTH-1; + LEFT_ALIGN_INDEX = 0; + + logOneMinusInsertionStartProbability = probToQual(1-insStart); + logInsertionStartProbability = probToQual(insStart); + logInsertionEndProbability = probToQual(insEnd); + logOneMinusInsertionEndProbability = probToQual(1-insEnd); + + + // initialize path metric and traceback memories for Viterbi computation + pathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH]; + bestStateIndexArray = new int[maxReadLength][2*(PATH_METRIC_TABLE_LENGTH)]; + + for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++) + pathMetricArray[k] = 0; + + // fill in probability for read deletion of length k = C*exp(k-1) + double prob = 1.0; + double sumProb = 0.0; + + deletionErrorProbabilities = new double[maxReadDeletionLength+1]; + + deletionErrorProbabilities[1] = noDeletionProbability; + for (int k=2; k <= maxReadDeletionLength; k++) { + deletionErrorProbabilities[k] = prob; + sumProb = sumProb + prob; + prob = prob*Math.exp(-1); + } + + // now get everything in log domain, set normalizing constant so that probabilities sum to one + deletionErrorProbabilities[1] = probToQual(deletionErrorProbabilities[1]); + for (int k=2; k <= maxReadDeletionLength; k++) { + deletionErrorProbabilities[k] = probToQual((1-noDeletionProbability)*deletionErrorProbabilities[k]/sumProb); + } + + + + } + + public static double probToQual(double prob) { + // TODO: see if I can use QualityUtils version, right now I don't want rounding or byte conversion + return -10.0*Math.log10(prob); + } + + public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { + + final long readStartPosition = read.getAlignmentStart(); + final long haplotypeStartPosition = haplotype.getStartPosition(); + final long readEndPosition = read.getUnclippedEnd(); + final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1; + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + + double[] previousPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH]; + int readStartIdx, initialIndexInHaplotype; + + /* List b = read.getAlignmentBlocks(); + // TODO temp hack, dont take realigned reads for now + if (b.size()>1) { + rrc++; + //System.out.format("found realigned read %d\n", rrc); + return 0.0; + + } + */ + // case 1: read started before haplotype start position, we need to iterate only over remainder of read + if (readStartPosition < haplotypeStartPosition) { + if (readEndPosition<= haplotypeEndPosition) + readStartIdx = (int)(haplotypeStartPosition - (readEndPosition - read.getReadBases().length)-1); + else + readStartIdx = (int)(haplotypeStartPosition- readStartPosition); + initialIndexInHaplotype = 0; // implicit above + + } + else { + readStartIdx = 0; + initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition); + } + + + // initialize path metric array to hard-code certainty that initial position is aligned + for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++) + pathMetricArray[k] = INFINITE; + + pathMetricArray[initialIndexInHaplotype] = 0; + + /* + System.out.println(read.getReadName()); + + System.out.println(haplotypeStartPosition); + System.out.println(readStartPosition); + System.out.println(readStartIdx); + System.out.println(initialIndexInHaplotype); + */ + + // Update path metric computations based on branch metric (Add/Compare/Select operations) + // do forward direction first, ie from anchor to end of read + // outer loop + for (int indR=readStartIdx; indR < readBases.length; indR++) { + byte readBase = readBases[indR]; + byte readQual = readQuals[indR]; + + System.arraycopy(pathMetricArray, 0, previousPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH); + + + + for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) { + + + byte haplotypeBase; + if (indX > LEFT_ALIGN_INDEX && indX <= haplotypeSize) + haplotypeBase = haplotype.getBasesAsBytes()[indX-1]; + else + haplotypeBase = readBase; + + updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual, previousPathMetricArray, true); + } + } + + double[] forwardPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH]; + System.arraycopy(pathMetricArray, 0, forwardPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH); + double forwardMetric = MathUtils.arrayMin(pathMetricArray); + + // reinitialize path metric array to known position at start of read + // initialize path metric array to hard-code certainty that initial position is aligned + for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++) + pathMetricArray[k] = INFINITE; + + pathMetricArray[initialIndexInHaplotype] = 0; + + // do now backward direction (from anchor to start of read) + // outer loop + for (int indR=readStartIdx-1; indR >= 0; indR--) { + byte readBase = readBases[indR]; + byte readQual = readQuals[indR]; + + System.arraycopy(pathMetricArray, 0, previousPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH); + + for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) { + byte haplotypeBase; + if (indX > 0 && indX <= haplotypeSize) + haplotypeBase = haplotype.getBasesAsBytes()[indX-1]; + else + haplotypeBase = readBase; + + updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual, previousPathMetricArray, false); + + } + } + + + // for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested + // in log-likelihood of best state, this isn't really necessary. +/* + int[] bestIndexArray = new int[readBases.length+1]; + + + int bestIndex = MathUtils.minElementIndex(forwardPathMetricArray); + bestIndexArray[readBases.length] = bestIndex; + + for (int k=readBases.length-1; k>=0; k--) { + bestIndex = bestStateIndexArray[k][bestIndex]; + bestIndexArray[k] = bestIndex; + } + + + System.out.println(read.getReadName()); + System.out.print("Haplotype:"); + + for (int k=0; k = indX-this.maxReadDeletionLength; indXold--){ + + if (indXold < 0) + break; + + // fetch path metric and add branch metric + bmetric = previousPathMetricArray[indXold] + deletionErrorProbabilities[indX-indXold] + + logOneMinusInsertionStartProbability + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indXold; + } + } + if (indX == RIGHT_ALIGN_INDEX) { + // special treatment for R->R case (read aligns to the right of haplotype) when Xold = X = R + bmetric = previousPathMetricArray[indX] + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indX; + } + + + } + } + } + else { + for (int indXold = indX+1; indXold <=this.maxReadDeletionLength + indX; indXold++){ + if (indXold > this.haplotypeSize+1) + break; + + // fetch path metric and add branch metric + bmetric = previousPathMetricArray[indXold] + deletionErrorProbabilities[indXold-indX] + + logOneMinusInsertionStartProbability + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indXold; + } + if (indX == LEFT_ALIGN_INDEX) { + // special treatment for R->R case (read aligns to the right of haplotype) when Xold = X = R + bmetric = previousPathMetricArray[indX] + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indX; + } + + + } + } + + } + // now Pr(Xb',Ib'=0| Xb,Ib=1): an insertion ended. Only case if Xb'=Xb+1 (walk diag down in path array). + + if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) { + if (isForward) + bmetric = previousPathMetricArray[indX-1+PATH_METRIC_TABLE_LENGTH] + logInsertionEndProbability + pBaseRead; + else + bmetric = previousPathMetricArray[indX+1+PATH_METRIC_TABLE_LENGTH] + logInsertionEndProbability + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + if (isForward) + bestMetricIndex = indX-1 + PATH_METRIC_TABLE_LENGTH; + else + bestMetricIndex = indX+1 + PATH_METRIC_TABLE_LENGTH; + + } + } + // record best path metric + pathMetricArray[indX] = bestMetric; + bestStateIndexArray[indR][indX] = bestMetricIndex; + + // now Pr(Xb', Ib'=1 | Xb, Ib=0) : an insertion started: (walk right in path array) + bestMetric = INFINITE; + pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, indX, 1); + + bmetric = previousPathMetricArray[indX] + logInsertionStartProbability + pBaseRead; + if (bmetric < bestMetric) { //if superfluous, could clean up + bestMetric = bmetric; + bestMetricIndex = indX; + } + + // final case: Pr(Xb', Ib'=1 | Xb, Ib=1): insertion continues, also walk right in path array + if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) { + bmetric = previousPathMetricArray[indX+PATH_METRIC_TABLE_LENGTH] + logOneMinusInsertionEndProbability + pBaseRead; + if (bmetric < bestMetric) { //if superfluous, could clean up + bestMetric = bmetric; + bestMetricIndex = indX+PATH_METRIC_TABLE_LENGTH; + } + + // record best path metric + pathMetricArray[indX+PATH_METRIC_TABLE_LENGTH] = bestMetric; + bestStateIndexArray[indR][indX+PATH_METRIC_TABLE_LENGTH] = bestMetricIndex; + } + } + + private double getProbabilityOfReadBaseGivenXandI(byte haplotypeBase, byte readBase, byte readQual, int indX, int indI) { + + + if (indX == this.haplotypeSize || indX == 0) { + // X = L or R + double baseProb = QualityUtils.qualToProb(readQual); + return probToQual(baseProb); + } + + if (haplotypeBase == readBase) { + double baseProb = QualityUtils.qualToProb(readQual); + return probToQual(baseProb); + } + else { + return (double)(readQual); + } + // TODO - optimize for speed by avoiding so many log conversions, can use a LUT + + } + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java new file mode 100755 index 000000000..c043c7137 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers; + +import net.sf.samtools.SAMRecord; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.CircularArray; +import org.broadinstitute.sting.utils.genotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.*; + +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.Window; + + +@Reference(window=@Window(start=-5,stop=50)) +@Allows({DataSource.READS, DataSource.REFERENCE}) +public class SimpleIndelGenotyperWalker extends RefWalker { + @Output + PrintStream out; + @Argument(fullName="maxReadDeletionLength",shortName="maxReadDeletionLength",doc="Max deletion length allowed when aligning reads to candidate haplotypes.",required=false) + int maxReadDeletionLength = 3; + + @Argument(fullName="insertionStartProbability",shortName="insertionStartProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) + double insertionStartProbability = 0.01; + + @Argument(fullName="insertionEndProbability",shortName="insertionEndProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) + double insertionEndProbability = 0.5; + + @Argument(fullName="alphaDeletionProbability",shortName="alphaDeletionProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) + double alphaDeletionProbability = 0.01; + + + + + @Override + public boolean generateExtendedEvents() { return true; } + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + + private HaplotypeIndelErrorModel model; + + private static final int HAPLOTYPE_SIZE = 20; + private static final int MAX_READ_LENGTH = 200; // TODO- make this dynamic + + + + + @Override + public void initialize() { + model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability, + insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, MAX_READ_LENGTH); + + } + + + /* private void countIndels(ReadBackedExtendedEventPileup p) { + for ( ExtendedEventPileupElement pe : p.toExtendedIterable() ) { + if ( ! pe.isIndel() ) continue; + if ( pe.getEventLength() > MAX_LENGTH ) continue; + if ( pe.isInsertion() ) insCounts[pe.getEventLength()-1]++; + else delCounts[pe.getEventLength()-1]++; + } + } + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + if (!context.hasBasePileup()) + return 0; + + VariantContext vc = tracker.getVariantContext(ref, "indels", null, context.getLocation(), true); + // ignore places where we don't have a variant + if ( vc == null ) + return 0; + + + if (!vc.isIndel()) + return 0; + + + List haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, HAPLOTYPE_SIZE); + + // combine likelihoods for all possible haplotype pair (n*(n-1)/2 combinations) + + double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; + double bestLikelihood = 1e13; + // for given allele haplotype, compute likelihood of read pileup given haplotype + ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads(); + + + int bestIndexI =0, bestIndexJ=0; + for (int i=0; i < haplotypesInVC.size(); i++) { + for (int j=i; j < haplotypesInVC.size(); j++){ + // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] + for (SAMRecord read : pileup.getReads()) { + // compute Pr(r | hi, hj) = 1/2*(Pr(r|hi) + Pr(r|hj) + + double readLikelihood[] = new double[2]; + readLikelihood[0]= -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read)/10.0; + if (i != j) + readLikelihood[1] = -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read)/10.0; + else + readLikelihood[1] = readLikelihood[0]; + + // likelihood is by convention -10*log10(pr(r|h)) + // sumlog10 computes 10^x[0]+10^x[1]+... + double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2; + haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair); + + } + + + if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) { + bestIndexI = i; + bestIndexJ = j; + bestLikelihood = haplotypeLikehoodMatrix[i][j]; + } + } + } + + //we say that most likely genotype at site is index (i,j) maximizing likelihood matrix + String type; + if (vc.isDeletion()) + type = "DEL"; + else if (vc.isInsertion()) + type = "INS"; + else + type = "OTH"; + + type += vc.getIndelLengths().toString(); + + + out.format("%s %d %s ",vc.getChr(), vc.getStart(), type); + + Genotype originalGenotype = vc.getGenotype("NA12878"); + + String oldG, newG; + if (originalGenotype.isHomRef()) + oldG = "HOMREF"; + else if (originalGenotype.isHet()) + oldG = "HET"; + else if (originalGenotype.isHomVar()) + oldG = "HOMVAR"; + else + oldG = "OTHER"; + + int x = bestIndexI+bestIndexJ; + if (x == 0) + newG = "HOMREF"; + else if (x == 1) + newG = "HET"; + else if (x == 2) + newG = "HOMVAR"; + else + newG = "OTHER"; + + out.format("NewG %s OldG %s\n", oldG, newG); + + +/* + if ( context.hasExtendedEventPileup() ) { + + // if we got indels at current position: + + ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getPileupWithoutMappingQualityZeroReads(); + if ( pileup.size() < MIN_COVERAGE ) return 0; + + if ( pileup.getNumberOfDeletions() + pileup.getNumberOfInsertions() > MAX_INDELS ) { + // we got too many indel events. Maybe it's even a true event, and what we are looking for are + // errors rather than true calls. Hence, we do not need these indels. We have to 1) discard + // all remaining indels from the buffer: if they are still in the buffer, they are too close + // to the current position; and 2) make sure that the next position at which we attempt to count again is + // sufficiently far *after* the current position. + // System.out.println("Non countable indel event at "+pileup.getLocation()); + countableIndelBuffer.clear(); + coverageBuffer.clear(); // we do not want to count observations (read bases) around non-countable indel as well + skipToLoc = GenomeLocParser.createGenomeLoc(pileup.getLocation().getContigIndex(),pileup.getLocation().getStop()+pileup.getMaxDeletionLength()+MIN_DISTANCE+1); + // System.out.println("Skip to "+skipToLoc); + } else { + // pileup does not contain too many indels, we need to store them in the buffer and count them later, + // if a non-countable indel event(s) do not show up too soon: + countableIndelBuffer.add(pileup); + } + return 0; + } + */ + + + return 1; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + return 0; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(Integer value, Integer sum) { + return value+sum; //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public void onTraversalDone(Integer result) { + } + +} + diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index 307744850..a92d9f242 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -24,11 +24,22 @@ package org.broadinstitute.sting.utils.genotype; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.Set; public class Haplotype { protected byte[] bases = null; protected double[] quals = null; + private GenomeLoc genomeLocation = null; + private boolean isReference = false; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual @@ -56,6 +67,16 @@ public class Haplotype { this(bases, 0); } + public Haplotype(byte[] bases, GenomeLoc loc) { + this(bases); + this.genomeLocation = loc; + } + + public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) { + this(bases, loc); + this.isReference = isRef; + } + public String toString() { return new String(this.bases); } @@ -73,4 +94,54 @@ public class Haplotype { public byte[] getBasesAsBytes() { return bases; } + + public long getStartPosition() { + return genomeLocation.getStart(); + } + + public boolean isReference() { + return isReference; + } + + + public static List makeHaplotypeListFromVariantContextAlleles(VariantContext vc, ReferenceContext ref, final int haplotypeSize) { + + + List haplotypeList = new ArrayList(); + + byte[] refBases = ref.getBases(); + + int numPrefBases = 5; // haplotypeSize/2; + + int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart()); + //int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event + + + byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); + byte[] basesAfterVariant = Arrays.copyOfRange(refBases, + startIdxInReference+numPrefBases+vc.getReference().getBases().length, refBases.length); + + + // Create location for all haplotypes + long startLoc = ref.getWindow().getStart() + startIdxInReference; + long stopLoc = startLoc + haplotypeSize-1; + + GenomeLoc locus = GenomeLocParser.createGenomeLoc(ref.getLocus().getContigIndex(),startLoc, + stopLoc); + + + for (Allele a : vc.getAlleles()) { + + byte[] alleleBases = a.getBases(); + // use string concatenation + String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant); + haplotypeString = haplotypeString.substring(0,haplotypeSize); + + haplotypeList.add(new Haplotype(haplotypeString.getBytes(), locus, a.isReference())); + + } + + return haplotypeList; + } + }