First roughly functional framework for a statistical indel error caller. The current implementation computes Pr(read|haplotype) based on Dindel's error model. As a first example, a simple walker takes an existing VCF, generates haplotypes around each call, and computes genotype likelihoods to exercise the model. No attempt is made yet to use prior information on indel allele frequency, or to use the multi-sample calling machinery.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4197 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-09-03 00:25:34 +00:00
parent a1cf3398a5
commit 8a7f5aba4b
3 changed files with 740 additions and 0 deletions

View File

@ -0,0 +1,410 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.AlignmentBlock;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: Aug 24, 2010
* Time: 1:31:08 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Dindel-style error model: computes a Phred-scaled Pr(read | haplotype) via a
 * Viterbi search over an alignment trellis.  Trellis states are haplotype
 * positions 1..haplotypeSize plus a left-of-haplotype state (index 0) and a
 * right-of-haplotype state (index haplotypeSize+1); the whole state set is
 * duplicated once more (offset by PATH_METRIC_TABLE_LENGTH) to represent
 * "currently inside an insertion".  All probabilities are kept as Phred costs
 * (-10*log10 p), so the best alignment is the one of MINIMUM path metric.
 *
 * NOTE(review): pathMetricArray and bestStateIndexArray are shared scratch
 * buffers reused across calls, so instances are NOT thread-safe.
 */
public class HaplotypeIndelErrorModel {
private final int maxReadDeletionLength; // maximum length of deletion on a read
private final double noDeletionProbability; // 1 - alpha; alpha parameterizes the deletion-length distribution
private final int haplotypeSize; // fixed length of the candidate haplotypes scored by this model
private final int maxReadLength; // sizes the traceback table (bestStateIndexArray)
// Number of trellis states per insertion layer: haplotype positions plus the two flanking states.
private final int PATH_METRIC_TABLE_LENGTH;
private final int RIGHT_ALIGN_INDEX; // state: read base lies to the right of the haplotype
private final int LEFT_ALIGN_INDEX; // state: read base lies to the left of the haplotype
// Sentinel cost for impossible states — effectively +infinity on the Phred scale.
private static final double INFINITE = 10000000000000.0;
// deletionErrorProbabilities[k] = Phred cost of advancing k haplotype bases per read base
// (k=1 means no deletion); index 0 is unused.
private double deletionErrorProbabilities[];
private double pathMetricArray[]; // current column of path metrics (scratch, reused per read)
private int bestStateIndexArray[][]; // traceback pointers; only consumed by the commented-out debug code
// Phred-scaled transition costs for starting / ending an insertion.
private final double logOneMinusInsertionStartProbability;
private final double logInsertionStartProbability;
private final double logInsertionEndProbability;
private final double logOneMinusInsertionEndProbability;
/**
 * @param mrdl          maximum read deletion length considered
 * @param insStart      probability that an insertion starts at a given read base
 * @param insEnd        probability that an open insertion ends at the next base
 * @param alpha         deletion-length distribution parameter; 1-alpha is the no-deletion probability
 * @param haplotypeSize fixed length of candidate haplotypes
 * @param maxReadLength maximum supported read length (sizes the traceback table)
 */
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength) {
this.maxReadDeletionLength = mrdl;
this.noDeletionProbability = 1-alpha;
this.haplotypeSize = haplotypeSize;
this.maxReadLength = maxReadLength;
// one state per haplotype base plus the two flanking alignment states
PATH_METRIC_TABLE_LENGTH = haplotypeSize+2;
RIGHT_ALIGN_INDEX = PATH_METRIC_TABLE_LENGTH-1;
LEFT_ALIGN_INDEX = 0;
// pre-convert insertion transition probabilities to Phred costs
logOneMinusInsertionStartProbability = probToQual(1-insStart);
logInsertionStartProbability = probToQual(insStart);
logInsertionEndProbability = probToQual(insEnd);
logOneMinusInsertionEndProbability = probToQual(1-insEnd);
// initialize path metric and traceback memories for Viterbi computation
pathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH];
bestStateIndexArray = new int[maxReadLength][2*(PATH_METRIC_TABLE_LENGTH)];
for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++)
pathMetricArray[k] = 0;
// fill in (unnormalized) probability for a read deletion of length k-1; the
// probability decays by a factor of e per extra deleted base.
// NOTE(review): the original comment said "C*exp(k-1)" but the code decays as
// exp(-(k-2)) — confirm which form was intended.
double prob = 1.0;
double sumProb = 0.0;
deletionErrorProbabilities = new double[maxReadDeletionLength+1];
// index 1 = advance by exactly one haplotype base, i.e. no deletion
deletionErrorProbabilities[1] = noDeletionProbability;
for (int k=2; k <= maxReadDeletionLength; k++) {
deletionErrorProbabilities[k] = prob;
sumProb = sumProb + prob;
prob = prob*Math.exp(-1);
}
// now get everything in log domain, set normalizing constant so that probabilities sum to one
deletionErrorProbabilities[1] = probToQual(deletionErrorProbabilities[1]);
for (int k=2; k <= maxReadDeletionLength; k++) {
deletionErrorProbabilities[k] = probToQual((1-noDeletionProbability)*deletionErrorProbabilities[k]/sumProb);
}
}
/**
 * Converts a probability to the Phred scale: -10*log10(prob).
 * Unlike QualityUtils, this keeps full double precision (no rounding or byte conversion).
 */
public static double probToQual(double prob) {
// TODO: see if I can use QualityUtils version, right now I don't want rounding or byte conversion
return -10.0*Math.log10(prob);
}
/**
 * Computes the Phred-scaled cost -10*log10 Pr(read | haplotype) of the best
 * (Viterbi) alignment of the read to the haplotype; smaller is a better fit.
 * The read is anchored at the haplotype position implied by its alignment
 * start, then the trellis is swept forward (anchor to read end) and backward
 * (anchor to read start); the two optimal metrics are summed.
 *
 * @param haplotype candidate haplotype (must carry a start position)
 * @param read      aligned read whose bases/quals are scored
 * @return summed forward+backward optimal path metric (Phred-scaled)
 */
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {
final long readStartPosition = read.getAlignmentStart();
final long haplotypeStartPosition = haplotype.getStartPosition();
final long readEndPosition = read.getUnclippedEnd();
final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1;
final byte[] readBases = read.getReadBases();
final byte[] readQuals = read.getBaseQualities();
double[] previousPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH];
int readStartIdx, initialIndexInHaplotype;
/* List<AlignmentBlock> b = read.getAlignmentBlocks();
// TODO temp hack, dont take realigned reads for now
if (b.size()>1) {
rrc++;
//System.out.format("found realigned read %d\n", rrc);
return 0.0;
}
*/
// case 1: read started before haplotype start position, we need to iterate only over remainder of read
// NOTE(review): the first branch mixes getUnclippedEnd() with getReadBases().length
// and subtracts 1; the arithmetic for clipped reads looks fragile — confirm.
if (readStartPosition < haplotypeStartPosition) {
if (readEndPosition<= haplotypeEndPosition)
readStartIdx = (int)(haplotypeStartPosition - (readEndPosition - read.getReadBases().length)-1);
else
readStartIdx = (int)(haplotypeStartPosition- readStartPosition);
initialIndexInHaplotype = 0; // implicit above
}
else {
readStartIdx = 0;
initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition);
}
// initialize path metric array to hard-code certainty that initial position is aligned
for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++)
pathMetricArray[k] = INFINITE;
pathMetricArray[initialIndexInHaplotype] = 0;
/*
System.out.println(read.getReadName());
System.out.println(haplotypeStartPosition);
System.out.println(readStartPosition);
System.out.println(readStartIdx);
System.out.println(initialIndexInHaplotype);
*/
// Update path metric computations based on branch metric (Add/Compare/Select operations)
// do forward direction first, ie from anchor to end of read
// outer loop
for (int indR=readStartIdx; indR < readBases.length; indR++) {
byte readBase = readBases[indR];
byte readQual = readQuals[indR];
System.arraycopy(pathMetricArray, 0, previousPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH);
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
byte haplotypeBase;
// states 1..haplotypeSize carry a real haplotype base; the flanking
// states are given the read base itself (zero-mismatch emission)
if (indX > LEFT_ALIGN_INDEX && indX <= haplotypeSize)
haplotypeBase = haplotype.getBasesAsBytes()[indX-1];
else
haplotypeBase = readBase;
updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual, previousPathMetricArray, true);
}
}
// snapshot the forward metrics before reusing the scratch array for the backward sweep
double[] forwardPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH];
System.arraycopy(pathMetricArray, 0, forwardPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH);
double forwardMetric = MathUtils.arrayMin(pathMetricArray);
// reinitialize path metric array to known position at start of read
// initialize path metric array to hard-code certainty that initial position is aligned
for (int k=0; k < 2*PATH_METRIC_TABLE_LENGTH; k++)
pathMetricArray[k] = INFINITE;
pathMetricArray[initialIndexInHaplotype] = 0;
// do now backward direction (from anchor to start of read)
// outer loop
for (int indR=readStartIdx-1; indR >= 0; indR--) {
byte readBase = readBases[indR];
byte readQual = readQuals[indR];
System.arraycopy(pathMetricArray, 0, previousPathMetricArray, 0, 2*PATH_METRIC_TABLE_LENGTH);
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
byte haplotypeBase;
if (indX > 0 && indX <= haplotypeSize)
haplotypeBase = haplotype.getBasesAsBytes()[indX-1];
else
haplotypeBase = readBase;
updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual, previousPathMetricArray, false);
}
}
// for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested
// in log-likelihood of best state, this isn't really necessary.
/*
int[] bestIndexArray = new int[readBases.length+1];
int bestIndex = MathUtils.minElementIndex(forwardPathMetricArray);
bestIndexArray[readBases.length] = bestIndex;
for (int k=readBases.length-1; k>=0; k--) {
bestIndex = bestStateIndexArray[k][bestIndex];
bestIndexArray[k] = bestIndex;
}
System.out.println(read.getReadName());
System.out.print("Haplotype:");
for (int k=0; k <haplotype.getBasesAsBytes().length; k++) {
System.out.format("%c ", haplotype.getBasesAsBytes()[k]);
}
System.out.println();
System.out.print("Read bases: ");
for (int k=0; k <readBases.length; k++) {
System.out.format("%c ", readBases[k]);
}
System.out.println();
System.out.print("Alignment: ");
for (int k=0; k <readBases.length; k++) {
System.out.format("%d ", bestIndexArray[k]);
}
System.out.println();
*/
// now just take optimum along all path metrics: that's the log likelihood of best alignment
double backwardMetric = MathUtils.arrayMin(pathMetricArray);
return forwardMetric + backwardMetric;
}
/**
 * Add/Compare/Select step of the Viterbi recursion for one (read base, state)
 * pair: picks the cheapest predecessor over the match/deletion, insertion-end,
 * insertion-start and insertion-continue transitions, then records the
 * resulting path metric and traceback pointer for state indX and for its
 * insertion twin indX+PATH_METRIC_TABLE_LENGTH.
 *
 * @param haplotypeBase haplotype base for state indX (or the read base for flanking states)
 * @param indX          trellis state index in the non-insertion layer
 * @param indR          read base index (row of the traceback table)
 * @param readBase      observed read base
 * @param readQual      Phred quality of the read base
 * @param previousPathMetricArray metrics from the previous read position
 * @param isForward     true for the anchor-to-read-end sweep, false for the mirrored backward sweep
 */
private void updatePathMetrics(byte haplotypeBase, int indX, int indR, byte readBase, byte readQual,
double[] previousPathMetricArray, boolean isForward) {
double bmetric;
// Pr(Rb', Xb',Ib'| Xb,Ib=0) is populated starting on Xb , =
// Pr(Rb'|Xb',Ib') Pr(Xb',Ib'|Xb,Ib) =
// Pr(Rb'|Xb',Ib') Pr(Xb'|Ib',Xb,Ib) Pr(Ib'|Ib)
// start with case Ib=0, Ib'=0: no insertion
double bestMetric = INFINITE;
int bestMetricIndex = 0;
double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, indX, 0);
if (isForward) {
if (indX==LEFT_ALIGN_INDEX){
// there can't be a transition into Xb=0
// special case: only one incoming path
// record best path metric
// pathMetricArray[indX] = INFINITE;
// bestStateIndexArray[indR][indX] = 0;
// bestMetric = INFINITE;
// bestMetricIndex = 0;
}
else {
// consider predecessors up to maxReadDeletionLength states to the left
for (int indXold = indX-1; indXold >= indX-this.maxReadDeletionLength; indXold--){
if (indXold < 0)
break;
// fetch path metric and add branch metric
bmetric = previousPathMetricArray[indXold] + deletionErrorProbabilities[indX-indXold] +
logOneMinusInsertionStartProbability + pBaseRead;
if (bmetric < bestMetric) {
bestMetric = bmetric;
bestMetricIndex = indXold;
}
}
if (indX == RIGHT_ALIGN_INDEX) {
// special treatment for R->R case (read aligns to the right of haplotype) when Xold = X = R
bmetric = previousPathMetricArray[indX] + pBaseRead;
if (bmetric < bestMetric) {
bestMetric = bmetric;
bestMetricIndex = indX;
}
}
}
}
else {
// backward sweep: predecessors lie to the RIGHT of indX
for (int indXold = indX+1; indXold <=this.maxReadDeletionLength + indX; indXold++){
if (indXold > this.haplotypeSize+1)
break;
// fetch path metric and add branch metric
bmetric = previousPathMetricArray[indXold] + deletionErrorProbabilities[indXold-indX] +
logOneMinusInsertionStartProbability + pBaseRead;
if (bmetric < bestMetric) {
bestMetric = bmetric;
bestMetricIndex = indXold;
}
// NOTE(review): unlike the forward sweep, this L->L special case sits INSIDE
// the deletion loop, so it is re-evaluated every iteration — harmless duplicate
// work, but confirm the asymmetry with the forward R->R case is intended.
if (indX == LEFT_ALIGN_INDEX) {
// special treatment for R->R case (read aligns to the right of haplotype) when Xold = X = R
bmetric = previousPathMetricArray[indX] + pBaseRead;
if (bmetric < bestMetric) {
bestMetric = bmetric;
bestMetricIndex = indX;
}
}
}
}
// now Pr(Xb',Ib'=0| Xb,Ib=1): an insertion ended. Only case if Xb'=Xb+1 (walk diag down in path array).
if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) {
if (isForward)
bmetric = previousPathMetricArray[indX-1+PATH_METRIC_TABLE_LENGTH] + logInsertionEndProbability + pBaseRead;
else
bmetric = previousPathMetricArray[indX+1+PATH_METRIC_TABLE_LENGTH] + logInsertionEndProbability + pBaseRead;
if (bmetric < bestMetric) {
bestMetric = bmetric;
if (isForward)
bestMetricIndex = indX-1 + PATH_METRIC_TABLE_LENGTH;
else
bestMetricIndex = indX+1 + PATH_METRIC_TABLE_LENGTH;
}
}
// record best path metric
pathMetricArray[indX] = bestMetric;
bestStateIndexArray[indR][indX] = bestMetricIndex;
// now Pr(Xb', Ib'=1 | Xb, Ib=0) : an insertion started: (walk right in path array)
bestMetric = INFINITE;
pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, indX, 1);
bmetric = previousPathMetricArray[indX] + logInsertionStartProbability + pBaseRead;
if (bmetric < bestMetric) { //if superfluous, could clean up
bestMetric = bmetric;
bestMetricIndex = indX;
}
// final case: Pr(Xb', Ib'=1 | Xb, Ib=1): insertion continues, also walk right in path array
if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) {
bmetric = previousPathMetricArray[indX+PATH_METRIC_TABLE_LENGTH] + logOneMinusInsertionEndProbability + pBaseRead;
if (bmetric < bestMetric) { //if superfluous, could clean up
bestMetric = bmetric;
bestMetricIndex = indX+PATH_METRIC_TABLE_LENGTH;
}
// record best path metric
pathMetricArray[indX+PATH_METRIC_TABLE_LENGTH] = bestMetric;
bestStateIndexArray[indR][indX+PATH_METRIC_TABLE_LENGTH] = bestMetricIndex;
}
}
/**
 * Phred-scaled emission cost of observing readBase at haplotype state indX.
 * A matching base (or an off-haplotype state) costs -10*log10(1-errorProb),
 * i.e. close to zero; a mismatch costs readQual.
 *
 * NOTE(review): parameter indI (insertion indicator) is never read.
 * NOTE(review): the off-haplotype test uses indX == haplotypeSize, but the
 * right-align state is haplotypeSize+1 (RIGHT_ALIGN_INDEX) while indX ==
 * haplotypeSize still carries a real haplotype base — looks like an
 * off-by-one; confirm against the caller's state layout.
 */
private double getProbabilityOfReadBaseGivenXandI(byte haplotypeBase, byte readBase, byte readQual, int indX, int indI) {
if (indX == this.haplotypeSize || indX == 0) {
// X = L or R
double baseProb = QualityUtils.qualToProb(readQual);
return probToQual(baseProb);
}
if (haplotypeBase == readBase) {
double baseProb = QualityUtils.qualToProb(readQual);
return probToQual(baseProb);
}
else {
return (double)(readQual);
}
// TODO - optimize for speed by avoiding so many log conversions, can use a LUT
}
}

View File

@ -0,0 +1,259 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.CircularArray;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.PrintStream;
import java.util.*;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.Window;
/**
 * Test walker for the Dindel-style {@link HaplotypeIndelErrorModel}: for every
 * indel in the "indels" ROD it builds one candidate haplotype per allele,
 * scores the read pileup against every haplotype pair, picks the pair with the
 * best (lowest Phred-scaled) likelihood, and prints the re-derived genotype
 * next to the genotype recorded in the input VCF for the hard-coded test
 * sample NA12878.
 */
@Reference(window=@Window(start=-5,stop=50))
@Allows({DataSource.READS, DataSource.REFERENCE})
public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
    @Output
    PrintStream out;

    @Argument(fullName="maxReadDeletionLength", shortName="maxReadDeletionLength",
            doc="Max deletion length allowed when aligning reads to candidate haplotypes.", required=false)
    int maxReadDeletionLength = 3;

    // Fixed the three doc strings below: the originals were copy-pasted from an
    // unrelated coverage argument and did not describe these parameters.
    @Argument(fullName="insertionStartProbability", shortName="insertionStartProbability",
            doc="Probability that an insertion error starts at any given read base.", required=false)
    double insertionStartProbability = 0.01;

    @Argument(fullName="insertionEndProbability", shortName="insertionEndProbability",
            doc="Probability that an open insertion error ends at the next read base.", required=false)
    double insertionEndProbability = 0.5;

    @Argument(fullName="alphaDeletionProbability", shortName="alphaDeletionProbability",
            doc="Alpha parameter of the read-deletion length distribution.", required=false)
    double alphaDeletionProbability = 0.01;

    @Override
    public boolean generateExtendedEvents() { return true; }

    @Override
    public boolean includeReadsWithDeletionAtLoci() { return true; }

    private HaplotypeIndelErrorModel model;

    private static final int HAPLOTYPE_SIZE = 20;
    private static final int MAX_READ_LENGTH = 200; // TODO - make this dynamic
    private static final String SAMPLE_NAME = "NA12878"; // TODO - hard-coded test sample

    @Override
    public void initialize() {
        model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability,
                insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, MAX_READ_LENGTH);
    }

    /**
     * Re-genotypes a single indel site from the "indels" ROD.
     *
     * @return 1 if the site was genotyped and reported, 0 if skipped.
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if (tracker == null || !context.hasBasePileup())
            return 0;

        VariantContext vc = tracker.getVariantContext(ref, "indels", null, context.getLocation(), true);
        // only re-genotype sites where the input VCF has an indel call
        if (vc == null || !vc.isIndel())
            return 0;

        List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles(vc, ref, HAPLOTYPE_SIZE);
        ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads();

        // combine likelihoods for all haplotype pairs (upper triangle, n*(n+1)/2 entries)
        double[][] haplotypeLikelihoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
        double bestLikelihood = Double.MAX_VALUE;
        int bestIndexI = 0, bestIndexJ = 0;

        for (int i = 0; i < haplotypesInVC.size(); i++) {
            for (int j = i; j < haplotypesInVC.size(); j++) {
                for (SAMRecord read : pileup.getReads()) {
                    // Pr(r | hi, hj) = (Pr(r|hi) + Pr(r|hj)) / 2; the model returns
                    // -10*log10 Pr(r|h), so negate and rescale to log10 before combining.
                    double[] readLikelihood = new double[2];
                    readLikelihood[0] = -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read) / 10.0;
                    readLikelihood[1] = (i == j) ? readLikelihood[0]
                            : -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read) / 10.0;
                    // sumLog10 computes 10^x[0] + 10^x[1] + ...
                    double probRGivenHPair = MathUtils.sumLog10(readLikelihood) / 2;
                    haplotypeLikelihoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
                }
                // the most likely genotype is the pair minimizing the Phred-scaled likelihood
                if (haplotypeLikelihoodMatrix[i][j] < bestLikelihood) {
                    bestIndexI = i;
                    bestIndexJ = j;
                    bestLikelihood = haplotypeLikelihoodMatrix[i][j];
                }
            }
        }

        out.format("%s %d %s ", vc.getChr(), vc.getStart(), variantTypeString(vc));
        // BUG FIX: the original passed (oldG, newG) under "NewG %s OldG %s" labels,
        // swapping the reported genotypes; arguments now match their labels.
        out.format("NewG %s OldG %s\n", newGenotypeString(bestIndexI, bestIndexJ), originalGenotypeString(vc));
        return 1;
    }

    /** One-token variant descriptor: DEL/INS/OTH plus the indel length list. */
    private static String variantTypeString(VariantContext vc) {
        String type;
        if (vc.isDeletion())
            type = "DEL";
        else if (vc.isInsertion())
            type = "INS";
        else
            type = "OTH";
        return type + vc.getIndelLengths().toString();
    }

    /** Genotype class recorded in the input VCF for the hard-coded test sample. */
    private static String originalGenotypeString(VariantContext vc) {
        Genotype g = vc.getGenotype(SAMPLE_NAME);
        if (g == null)                 // robustness: sample may be absent (original NPE'd here)
            return "OTHER";
        if (g.isHomRef())
            return "HOMREF";
        if (g.isHet())
            return "HET";
        if (g.isHomVar())
            return "HOMVAR";
        return "OTHER";
    }

    /** Maps the winning haplotype-pair indices to a genotype class (assumes biallelic: index 0 = ref). */
    private static String newGenotypeString(int i, int j) {
        switch (i + j) {
            case 0:  return "HOMREF";
            case 1:  return "HET";
            case 2:  return "HOMVAR";
            default: return "OTHER";
        }
    }

    /** @return initial value for the reduce step. */
    public Integer reduceInit() {
        return 0;
    }

    /** Accumulates the number of sites genotyped. */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    @Override
    public void onTraversalDone(Integer result) {
    }
}

View File

@ -24,11 +24,22 @@
package org.broadinstitute.sting.utils.genotype;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
public class Haplotype {
protected byte[] bases = null;
protected double[] quals = null;
private GenomeLoc genomeLocation = null;
private boolean isReference = false;
/**
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
@ -56,6 +67,16 @@ public class Haplotype {
this(bases, 0);
}
/**
 * Creates a haplotype from the given bases, anchored at a genomic interval.
 *
 * @param bases haplotype bases
 * @param loc   genomic interval this haplotype spans
 */
public Haplotype(byte[] bases, GenomeLoc loc) {
    this(bases);
    genomeLocation = loc;
}
/**
 * Creates a haplotype anchored at a genomic interval, flagging whether it
 * carries the reference allele.
 *
 * @param bases haplotype bases
 * @param loc   genomic interval this haplotype spans
 * @param isRef true if this haplotype represents the reference allele
 */
public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) {
    this(bases);
    genomeLocation = loc;
    isReference = isRef;
}
/** @return the haplotype bases rendered as a plain String. */
@Override
public String toString() {
    return new String(bases);
}
@ -73,4 +94,54 @@ public class Haplotype {
/**
 * Returns the haplotype bases.
 * NOTE(review): hands out the internal array without a defensive copy, so
 * callers can mutate haplotype state; kept cheap on purpose since the indel
 * error model indexes into it once per read base in a hot loop.
 */
public byte[] getBasesAsBytes() {
return bases;
}
/**
 * Start coordinate of this haplotype's genomic interval.
 * NOTE(review): throws NullPointerException for haplotypes built without a
 * GenomeLoc (genomeLocation defaults to null) — confirm all callers use the
 * location-aware constructors.
 */
public long getStartPosition() {
return genomeLocation.getStart();
}
/** @return true if this haplotype carries the reference allele. */
public boolean isReference() {
return isReference;
}
/**
 * Builds one fixed-length haplotype per allele of the given variant context:
 * numPrefBases reference bases before the variant, then the allele bases, then
 * reference sequence to the end of the window, trimmed to haplotypeSize.
 * All haplotypes share the same GenomeLoc and carry the allele's
 * reference/alternate flag.
 *
 * NOTE(review): substring(0, haplotypeSize) throws StringIndexOutOfBoundsException
 * if the assembled sequence is shorter than haplotypeSize (e.g. near the end of
 * the reference window) — confirm callers always provide enough context.
 * NOTE(review): insertion/deletion alleles span different reference lengths but
 * are all stamped with the same locus — presumably intended; verify downstream use.
 *
 * @param vc            variant context whose alleles seed the haplotypes
 * @param ref           reference context; its window must cover the haplotypes
 * @param haplotypeSize fixed length of every returned haplotype
 * @return one Haplotype per allele, in vc.getAlleles() order
 */
public static List<Haplotype> makeHaplotypeListFromVariantContextAlleles(VariantContext vc, ReferenceContext ref, final int haplotypeSize) {
List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
byte[] refBases = ref.getBases();
// number of reference bases kept before the variant start (hard-coded for now)
int numPrefBases = 5; // haplotypeSize/2;
// +1 because an indel VariantContext starts one base before the event
int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart());
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases);
// reference suffix starts right after the reference allele's bases
byte[] basesAfterVariant = Arrays.copyOfRange(refBases,
startIdxInReference+numPrefBases+vc.getReference().getBases().length, refBases.length);
// Create location for all haplotypes
long startLoc = ref.getWindow().getStart() + startIdxInReference;
long stopLoc = startLoc + haplotypeSize-1;
GenomeLoc locus = GenomeLocParser.createGenomeLoc(ref.getLocus().getContigIndex(),startLoc,
stopLoc);
for (Allele a : vc.getAlleles()) {
byte[] alleleBases = a.getBases();
// use string concatenation
String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
haplotypeString = haplotypeString.substring(0,haplotypeSize);
haplotypeList.add(new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
}
return haplotypeList;
}
}