diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java new file mode 100755 index 000000000..ccdf30f06 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.AlignmentBlock; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.Haplotype; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Aug 24, 2010 + * Time: 1:31:08 PM + * To change this template use File | Settings | File Templates. + */ +public class HaplotypeIndelErrorModel { + + private final int maxReadDeletionLength; // maximum length of deletion on a read + private final double noDeletionProbability; // alpha for geometric distribution for deletion length + private final int haplotypeSize; + private final int maxReadLength; + private final int PATH_METRIC_TABLE_LENGTH; + private final int RIGHT_ALIGN_INDEX; + private final int LEFT_ALIGN_INDEX; + + private static final double INFINITE = 10000000000000.0; + + private double deletionErrorProbabilities[]; + + private double pathMetricArray[]; + private int bestStateIndexArray[][]; + + private final double logOneMinusInsertionStartProbability; + private final double logInsertionStartProbability; + private final double logInsertionEndProbability; + private final double logOneMinusInsertionEndProbability; + + public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength) { + this.maxReadDeletionLength = mrdl; + this.noDeletionProbability = 1-alpha; + this.haplotypeSize = haplotypeSize; + this.maxReadLength = maxReadLength; + + PATH_METRIC_TABLE_LENGTH = haplotypeSize+2; + RIGHT_ALIGN_INDEX = PATH_METRIC_TABLE_LENGTH-1; + LEFT_ALIGN_INDEX = 0; + + logOneMinusInsertionStartProbability = probToQual(1-insStart); + logInsertionStartProbability = probToQual(insStart); + logInsertionEndProbability = probToQual(insEnd); + logOneMinusInsertionEndProbability = probToQual(1-insEnd); + + + // initialize path metric and traceback memories for Viterbi computation + pathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH]; + bestStateIndexArray = new int[maxReadLength][2*(PATH_METRIC_TABLE_LENGTH)]; + + for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++) + pathMetricArray[k] = 0; + + // fill in probability for read deletion of length k = C*exp(k-1) + double prob = 1.0; + double sumProb = 0.0; + + deletionErrorProbabilities = new double[maxReadDeletionLength+1]; + + deletionErrorProbabilities[1] = noDeletionProbability; + for (int k=2; k <= maxReadDeletionLength; k++) { + deletionErrorProbabilities[k] = prob; + sumProb = sumProb + prob; + prob = prob*Math.exp(-1); + } + + // now get everything in log domain, set normalizing constant so that probabilities sum to one + deletionErrorProbabilities[1] = probToQual(deletionErrorProbabilities[1]); + for (int k=2; k <= maxReadDeletionLength; k++) { + deletionErrorProbabilities[k] = probToQual((1-noDeletionProbability)*deletionErrorProbabilities[k]/sumProb); + } + + + + } + + public static double probToQual(double prob) { + // TODO: see if I can use QualityUtils version, right now I don't want rounding or byte conversion + return -10.0*Math.log10(prob); + } + + public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { + + final long readStartPosition = read.getAlignmentStart(); + final long haplotypeStartPosition = haplotype.getStartPosition(); + final long readEndPosition = read.getUnclippedEnd(); + final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1; + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + + double[] previousPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH]; + int readStartIdx, initialIndexInHaplotype; + + /* List b = read.getAlignmentBlocks(); + // TODO temp hack, dont take realigned reads for now + if (b.size()>1) { + rrc++; + //System.out.format("found realigned read %d\n", rrc); + return 0.0; + + } + */ + // case 1: read started before haplotype start position, we need to iterate only over remainder of read + if (readStartPosition < haplotypeStartPosition) { + if (readEndPosition<= haplotypeEndPosition) + readStartIdx = (int)(haplotypeStartPosition - (readEndPosition - read.getReadBases().length)-1); + else + readStartIdx = (int)(haplotypeStartPosition- readStartPosition); + initialIndexInHaplotype = 0; // implicit above + + } + else { + readStartIdx = 0; + initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition); + } + + + // initialize path metric array to hard-code certainty that initial position is aligned + for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++) + pathMetricArray[k] = INFINITE; + + pathMetricArray[initialIndexInHaplotype] = 0; + + /* + System.out.println(read.getReadName()); + + System.out.println(haplotypeStartPosition); + System.out.println(readStartPosition); + System.out.println(readStartIdx); + System.out.println(initialIndexInHaplotype); + */ + + // Update path metric computations based on branch metric (Add/Compare/Select operations) + // do forward direction first, ie from anchor to end of read + // outer loop + for (int indR=readStartIdx; indR < readBases.length; indR++) { + byte readBase = readBases[indR]; + byte readQual = readQuals[indR]; + + System.arraycopy(pathMetricArray, 0, previousPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH); + + + + for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) { + + + byte haplotypeBase; + if (indX > LEFT_ALIGN_INDEX && indX <= haplotypeSize) + haplotypeBase = haplotype.getBasesAsBytes()[indX-1]; + else + haplotypeBase = readBase; + + updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual, previousPathMetricArray, true); + } + } + + double[] forwardPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH]; + System.arraycopy(pathMetricArray, 0, forwardPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH); + double forwardMetric = MathUtils.arrayMin(pathMetricArray); + + // reinitialize path metric array to known position at start of read + // initialize path metric array to hard-code certainty that initial position is aligned + for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++) + pathMetricArray[k] = INFINITE; + + pathMetricArray[initialIndexInHaplotype] = 0; + + // do now backward direction (from anchor to start of read) + // outer loop + for (int indR=readStartIdx-1; indR >= 0; indR--) { + byte readBase = readBases[indR]; + byte readQual = readQuals[indR]; + + System.arraycopy(pathMetricArray, 0, previousPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH); + + for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) { + byte haplotypeBase; + if (indX > 0 && indX <= haplotypeSize) + haplotypeBase = haplotype.getBasesAsBytes()[indX-1]; + else + haplotypeBase = readBase; + + updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual, previousPathMetricArray, false); + + } + } + + + // for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested + // in log-likelihood of best state, this isn't really necessary. +/* + int[] bestIndexArray = new int[readBases.length+1]; + + + int bestIndex = MathUtils.minElementIndex(forwardPathMetricArray); + bestIndexArray[readBases.length] = bestIndex; + + for (int k=readBases.length-1; k>=0; k--) { + bestIndex = bestStateIndexArray[k][bestIndex]; + bestIndexArray[k] = bestIndex; + } + + + System.out.println(read.getReadName()); + System.out.print("Haplotype:"); + + for (int k=0; k = indX-this.maxReadDeletionLength; indXold--){ + + if (indXold < 0) + break; + + // fetch path metric and add branch metric + bmetric = previousPathMetricArray[indXold] + deletionErrorProbabilities[indX-indXold] + + logOneMinusInsertionStartProbability + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indXold; + } + } + if (indX == RIGHT_ALIGN_INDEX) { + // special treatment for R->R case (read aligns to the right of haplotype) when Xold = X = R + bmetric = previousPathMetricArray[indX] + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indX; + } + + + } + } + } + else { + for (int indXold = indX+1; indXold <=this.maxReadDeletionLength + indX; indXold++){ + if (indXold > this.haplotypeSize+1) + break; + + // fetch path metric and add branch metric + bmetric = previousPathMetricArray[indXold] + deletionErrorProbabilities[indXold-indX] + + logOneMinusInsertionStartProbability + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indXold; + } + if (indX == LEFT_ALIGN_INDEX) { + // special treatment for R->R case (read aligns to the right of haplotype) when Xold = X = R + bmetric = previousPathMetricArray[indX] + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + bestMetricIndex = indX; + } + + + } + } + + } + // now Pr(Xb',Ib'=0| Xb,Ib=1): an insertion ended. Only case if Xb'=Xb+1 (walk diag down in path array). + + if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) { + if (isForward) + bmetric = previousPathMetricArray[indX-1+PATH_METRIC_TABLE_LENGTH] + logInsertionEndProbability + pBaseRead; + else + bmetric = previousPathMetricArray[indX+1+PATH_METRIC_TABLE_LENGTH] + logInsertionEndProbability + pBaseRead; + + if (bmetric < bestMetric) { + bestMetric = bmetric; + if (isForward) + bestMetricIndex = indX-1 + PATH_METRIC_TABLE_LENGTH; + else + bestMetricIndex = indX+1 + PATH_METRIC_TABLE_LENGTH; + + } + } + // record best path metric + pathMetricArray[indX] = bestMetric; + bestStateIndexArray[indR][indX] = bestMetricIndex; + + // now Pr(Xb', Ib'=1 | Xb, Ib=0) : an insertion started: (walk right in path array) + bestMetric = INFINITE; + pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, indX, 1); + + bmetric = previousPathMetricArray[indX] + logInsertionStartProbability + pBaseRead; + if (bmetric < bestMetric) { //if superfluous, could clean up + bestMetric = bmetric; + bestMetricIndex = indX; + } + + // final case: Pr(Xb', Ib'=1 | Xb, Ib=1): insertion continues, also walk right in path array + if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) { + bmetric = previousPathMetricArray[indX+PATH_METRIC_TABLE_LENGTH] + logOneMinusInsertionEndProbability + pBaseRead; + if (bmetric < bestMetric) { //if superfluous, could clean up + bestMetric = bmetric; + bestMetricIndex = indX+PATH_METRIC_TABLE_LENGTH; + } + + // record best path metric + pathMetricArray[indX+PATH_METRIC_TABLE_LENGTH] = bestMetric; + bestStateIndexArray[indR][indX+PATH_METRIC_TABLE_LENGTH] = bestMetricIndex; + } + } + + private double getProbabilityOfReadBaseGivenXandI(byte haplotypeBase, byte readBase, byte readQual, int indX, int indI) { + + + if (indX == this.haplotypeSize || indX == 0) { + // X = L or R + double baseProb = QualityUtils.qualToProb(readQual); + return probToQual(baseProb); + } + + if (haplotypeBase == readBase) { + double baseProb = QualityUtils.qualToProb(readQual); + return probToQual(baseProb); + } + else { + return (double)(readQual); + } + // TODO - optimize for speed by avoiding so many log conversions, can use a LUT + + } + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java new file mode 100755 index 000000000..c043c7137 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers; + +import net.sf.samtools.SAMRecord; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.CircularArray; +import org.broadinstitute.sting.utils.genotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.*; + +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.Window; + + +@Reference(window=@Window(start=-5,stop=50)) +@Allows({DataSource.READS, DataSource.REFERENCE}) +public class SimpleIndelGenotyperWalker extends RefWalker { + @Output + PrintStream out; + @Argument(fullName="maxReadDeletionLength",shortName="maxReadDeletionLength",doc="Max deletion length allowed when aligning reads to candidate haplotypes.",required=false) + int maxReadDeletionLength = 3; + + @Argument(fullName="insertionStartProbability",shortName="insertionStartProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) + double insertionStartProbability = 0.01; + + @Argument(fullName="insertionEndProbability",shortName="insertionEndProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) + double insertionEndProbability = 0.5; + + @Argument(fullName="alphaDeletionProbability",shortName="alphaDeletionProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) + double alphaDeletionProbability = 0.01; + + + + + @Override + public boolean generateExtendedEvents() { return true; } + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + + private HaplotypeIndelErrorModel model; + + private static final int HAPLOTYPE_SIZE = 20; + private static final int MAX_READ_LENGTH = 200; // TODO- make this dynamic + + + + + @Override + public void initialize() { + model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability, + insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, MAX_READ_LENGTH); + + } + + + /* private void countIndels(ReadBackedExtendedEventPileup p) { + for ( ExtendedEventPileupElement pe : p.toExtendedIterable() ) { + if ( ! pe.isIndel() ) continue; + if ( pe.getEventLength() > MAX_LENGTH ) continue; + if ( pe.isInsertion() ) insCounts[pe.getEventLength()-1]++; + else delCounts[pe.getEventLength()-1]++; + } + } + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + if (!context.hasBasePileup()) + return 0; + + VariantContext vc = tracker.getVariantContext(ref, "indels", null, context.getLocation(), true); + // ignore places where we don't have a variant + if ( vc == null ) + return 0; + + + if (!vc.isIndel()) + return 0; + + + List haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, HAPLOTYPE_SIZE); + + // combine likelihoods for all possible haplotype pair (n*(n-1)/2 combinations) + + double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; + double bestLikelihood = 1e13; + // for given allele haplotype, compute likelihood of read pileup given haplotype + ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads(); + + + int bestIndexI =0, bestIndexJ=0; + for (int i=0; i < haplotypesInVC.size(); i++) { + for (int j=i; j < haplotypesInVC.size(); j++){ + // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] + for (SAMRecord read : pileup.getReads()) { + // compute Pr(r | hi, hj) = 1/2*(Pr(r|hi) + Pr(r|hj) + + double readLikelihood[] = new double[2]; + readLikelihood[0]= -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read)/10.0; + if (i != j) + readLikelihood[1] = -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read)/10.0; + else + readLikelihood[1] = readLikelihood[0]; + + // likelihood is by convention -10*log10(pr(r|h)) + // sumlog10 computes 10^x[0]+10^x[1]+... + double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2; + haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair); + + } + + + if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) { + bestIndexI = i; + bestIndexJ = j; + bestLikelihood = haplotypeLikehoodMatrix[i][j]; + } + } + } + + //we say that most likely genotype at site is index (i,j) maximizing likelihood matrix + String type; + if (vc.isDeletion()) + type = "DEL"; + else if (vc.isInsertion()) + type = "INS"; + else + type = "OTH"; + + type += vc.getIndelLengths().toString(); + + + out.format("%s %d %s ",vc.getChr(), vc.getStart(), type); + + Genotype originalGenotype = vc.getGenotype("NA12878"); + + String oldG, newG; + if (originalGenotype.isHomRef()) + oldG = "HOMREF"; + else if (originalGenotype.isHet()) + oldG = "HET"; + else if (originalGenotype.isHomVar()) + oldG = "HOMVAR"; + else + oldG = "OTHER"; + + int x = bestIndexI+bestIndexJ; + if (x == 0) + newG = "HOMREF"; + else if (x == 1) + newG = "HET"; + else if (x == 2) + newG = "HOMVAR"; + else + newG = "OTHER"; + + out.format("NewG %s OldG %s\n", oldG, newG); + + +/* + if ( context.hasExtendedEventPileup() ) { + + // if we got indels at current position: + + ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getPileupWithoutMappingQualityZeroReads(); + if ( pileup.size() < MIN_COVERAGE ) return 0; + + if ( pileup.getNumberOfDeletions() + pileup.getNumberOfInsertions() > MAX_INDELS ) { + // we got too many indel events. Maybe it's even a true event, and what we are looking for are + // errors rather than true calls. Hence, we do not need these indels. We have to 1) discard + // all remaining indels from the buffer: if they are still in the buffer, they are too close + // to the current position; and 2) make sure that the next position at which we attempt to count again is + // sufficiently far *after* the current position. + // System.out.println("Non countable indel event at "+pileup.getLocation()); + countableIndelBuffer.clear(); + coverageBuffer.clear(); // we do not want to count observations (read bases) around non-countable indel as well + skipToLoc = GenomeLocParser.createGenomeLoc(pileup.getLocation().getContigIndex(),pileup.getLocation().getStop()+pileup.getMaxDeletionLength()+MIN_DISTANCE+1); + // System.out.println("Skip to "+skipToLoc); + } else { + // pileup does not contain too many indels, we need to store them in the buffer and count them later, + // if a non-countable indel event(s) do not show up too soon: + countableIndelBuffer.add(pileup); + } + return 0; + } + */ + + + return 1; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + return 0; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(Integer value, Integer sum) { + return value+sum; //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public void onTraversalDone(Integer result) { + } + +} + diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index 307744850..a92d9f242 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -24,11 +24,22 @@ package org.broadinstitute.sting.utils.genotype; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.Set; public class Haplotype { protected byte[] bases = null; protected double[] quals = null; + private GenomeLoc genomeLocation = null; + private boolean isReference = false; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual @@ -56,6 +67,16 @@ public class Haplotype { this(bases, 0); } + public Haplotype(byte[] bases, GenomeLoc loc) { + this(bases); + this.genomeLocation = loc; + } + + public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) { + this(bases, loc); + this.isReference = isRef; + } + public String toString() { return new String(this.bases); } @@ -73,4 +94,54 @@ public class Haplotype { public byte[] getBasesAsBytes() { return bases; } + + public long getStartPosition() { + return genomeLocation.getStart(); + } + + public boolean isReference() { + return isReference; + } + + + public static List makeHaplotypeListFromVariantContextAlleles(VariantContext vc, ReferenceContext ref, final int haplotypeSize) { + + + List haplotypeList = new ArrayList(); + + byte[] refBases = ref.getBases(); + + int numPrefBases = 5; // haplotypeSize/2; + + int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart()); + //int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event + + + byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); + byte[] basesAfterVariant = Arrays.copyOfRange(refBases, + startIdxInReference+numPrefBases+vc.getReference().getBases().length, refBases.length); + + + // Create location for all haplotypes + long startLoc = ref.getWindow().getStart() + startIdxInReference; + long stopLoc = startLoc + haplotypeSize-1; + + GenomeLoc locus = GenomeLocParser.createGenomeLoc(ref.getLocus().getContigIndex(),startLoc, + stopLoc); + + + for (Allele a : vc.getAlleles()) { + + byte[] alleleBases = a.getBases(); + // use string concatenation + String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant); + haplotypeString = haplotypeString.substring(0,haplotypeSize); + + haplotypeList.add(new Haplotype(haplotypeString.getBytes(), locus, a.isReference())); + + } + + return haplotypeList; + } + }