Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-07-17 15:12:28 -04:00
commit b0d99fd10d
18 changed files with 3409 additions and 5 deletions

View File

@ -0,0 +1,60 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.jgrapht.graph.DefaultDirectedGraph;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 23, 2011
*/
// simple edge class for connecting nodes in the graph
public class DeBruijnEdge implements Comparable<DeBruijnEdge> {
private int multiplicity;
private boolean isRef;
public DeBruijnEdge() {
multiplicity = 1;
isRef = false;
}
public DeBruijnEdge( final boolean isRef ) {
multiplicity = 1;
this.isRef = isRef;
}
public DeBruijnEdge( final boolean isRef, final int multiplicity ) {
this.multiplicity = multiplicity;
this.isRef = isRef;
}
public int getMultiplicity() {
return multiplicity;
}
public void setMultiplicity( final int value ) {
multiplicity = value;
}
public boolean getIsRef() {
return isRef;
}
public void setIsRef( final boolean isRef ) {
this.isRef = isRef;
}
public boolean equals( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
}
public boolean equals( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge, final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph2 ) {
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
}
@Override
public int compareTo( final DeBruijnEdge that ) {
return this.multiplicity - that.multiplicity;
}
}

View File

@ -0,0 +1,46 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 23, 2011
*/
// simple node class for storing kmer sequences
public class DeBruijnVertex {
protected final byte[] sequence;
public final int kmer;
public DeBruijnVertex( final byte[] sequence, final int kmer ) {
this.sequence = sequence;
this.kmer = kmer;
}
@Override
public boolean equals( Object v ) {
return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence);
}
@Override
public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
return Arrays.hashCode(sequence);
}
public String toString() {
return new String(sequence);
}
public String getSuffixString() {
return new String( getSuffix() );
}
public byte[] getSequence() {
return sequence;
}
public byte[] getSuffix() {
return Arrays.copyOfRange( sequence, kmer - 1, sequence.length );
}
}

View File

@ -0,0 +1,610 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public class GenotypingEngine {
private final boolean DEBUG;
private final int MNP_LOOK_AHEAD;
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
public GenotypingEngine( final boolean DEBUG, final int MNP_LOOK_AHEAD, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
this.DEBUG = DEBUG;
this.MNP_LOOK_AHEAD = MNP_LOOK_AHEAD;
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
noCall.add(Allele.NO_CALL);
}
// This function is the streamlined approach, currently not being used
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) {
// Prepare the list of haplotype indices to genotype
final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();
for( final Haplotype h : haplotypes ) {
allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) );
}
final int numHaplotypes = haplotypes.size();
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample);
int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
}
}
genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make());
}
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here
// Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF
haplotypesToRemove.add(h);
}
}
haplotypes.removeAll(haplotypesToRemove);
if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
final List<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>> returnVCs = new ArrayList<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>>();
// set up the default 1-to-1 haplotype mapping object
final HashMap<Allele,ArrayList<Haplotype>> haplotypeMapping = new HashMap<Allele,ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
list.add(h);
haplotypeMapping.put(call.getAllele(h.getBases()), list);
}
returnVCs.add( new Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>(call,haplotypeMapping) );
return returnVCs;
}
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
int count = 0;
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
for( final Haplotype h : haplotypes ) {
if( DEBUG ) {
System.out.println( h.toString() );
System.out.println( "> Cigar = " + h.getCigar() );
}
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
startPosKeySet.addAll(h.getEventMap().keySet());
}
// Create the VC merge priority list
final ArrayList<String> priorityList = new ArrayList<String>();
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
priorityList.add("HC" + iii);
}
// Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
for( final Haplotype h : haplotypes ) {
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
eventsAtThisLoc.add(vc);
}
}
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
// Merge the event to find a common reference representation
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
int aCount = 0;
for( final Allele a : mergedVC.getAlleles() ) {
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
}
if( DEBUG ) {
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
}
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
final int myNumHaplotypes = alleleMapper.size();
final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
int glIndex = 0;
for( int iii = 0; iii < myNumHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
}
}
// using the allele mapping object translate the haplotype allele into the event allele
final Genotype g = new GenotypeBuilder(sample)
.alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes))
.phased(loc != startPosKeySet.first())
.PL(genotypeLikelihoods).make();
myGenotypes.add(g);
}
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(
new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) );
}
}
return returnCalls;
}
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser, final ArrayList<VariantContext> activeAllelesToGenotype ) {
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
int count = 0;
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
for( final Haplotype h : haplotypes ) {
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( DEBUG ) {
System.out.println( h.toString() );
System.out.println( "> Cigar = " + h.getCigar() );
System.out.println( "> Left and right breaks = (" + h.leftBreakPoint + " , " + h.rightBreakPoint + ")");
System.out.println( ">> Events = " + h.getEventMap());
}
}
// Create the VC merge priority list
final ArrayList<String> priorityList = new ArrayList<String>();
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
priorityList.add("HC" + iii);
}
cleanUpSymbolicUnassembledEvents( haplotypes, priorityList );
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
}
if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() );
}
}
// Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
if( activeAllelesToGenotype.isEmpty() ) {
for( final Haplotype h : haplotypes ) {
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
eventsAtThisLoc.add(vc);
}
}
} else { // we are in GGA mode!
for( final VariantContext compVC : activeAllelesToGenotype ) {
if( compVC.getStart() == loc ) {
priorityList.clear();
int alleleCount = 0;
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
HashSet<Allele> alleleSet = new HashSet<Allele>(2);
alleleSet.add(compVC.getReference());
alleleSet.add(compAltAllele);
priorityList.add("Allele" + alleleCount);
eventsAtThisLoc.add(new VariantContextBuilder(compVC).alleles(alleleSet).source("Allele"+alleleCount).make());
alleleCount++;
}
}
}
}
if( eventsAtThisLoc.isEmpty() ) { continue; }
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
// Merge the event to find a common reference representation
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
int aCount = 0;
for( final Allele a : mergedVC.getAlleles() ) {
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
}
if( DEBUG ) {
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
}
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
final int numHaplotypes = alleleMapper.size();
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
}
}
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call != null ) {
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
}
}
}
return returnCalls;
}
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes, final ArrayList<String> priorityList ) {
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
final ArrayList<String> stringsToRemove = new ArrayList<String>();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().values() ) {
if( vc.isSymbolic() ) {
for( final Haplotype h2 : haplotypes ) {
for( final VariantContext vc2 : h2.getEventMap().values() ) {
if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
haplotypesToRemove.add(h);
stringsToRemove.add(vc.getSource());
break;
}
}
}
}
}
}
haplotypes.removeAll(haplotypesToRemove);
priorityList.removeAll(stringsToRemove);
}
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
final int MAX_SIZE_TO_COMBINE = 10;
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
if( startPosKeySet.size() <= 1 ) { return; }
boolean mapWasUpdated = true;
while( mapWasUpdated ) {
mapWasUpdated = false;
// loop over the set of start locations and consider pairs that start near each other
final Iterator<Integer> iter = startPosKeySet.iterator();
int thisStart = iter.next();
while( iter.hasNext() ) {
final int nextStart = iter.next();
if( nextStart - thisStart < MAX_SIZE_TO_COMBINE) {
boolean isBiallelic = true;
VariantContext thisVC = null;
VariantContext nextVC = null;
int x11 = 0;
int x12 = 0;
int x21 = 0;
int x22 = 0;
for( final Haplotype h : haplotypes ) {
// only make complex substitutions out of consecutive biallelic sites
final VariantContext thisHapVC = h.getEventMap().get(thisStart);
if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype
if( thisVC == null ) {
thisVC = thisHapVC;
} else if( !thisHapVC.hasSameAllelesAs( thisVC ) ) {
isBiallelic = false;
break;
}
}
final VariantContext nextHapVC = h.getEventMap().get(nextStart);
if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype
if( nextVC == null ) {
nextVC = nextHapVC;
} else if( !nextHapVC.hasSameAllelesAs( nextVC ) ) {
isBiallelic = false;
break;
}
}
// count up the co-occurrences of the events for the R^2 calculation
// BUGBUG: use haplotype likelihoods per-sample to make this more accurate
if( thisHapVC == null ) {
if( nextHapVC == null ) { x11++; }
else { x12++; }
} else {
if( nextHapVC == null ) { x21++; }
else { x22++; }
}
}
if( thisVC == null || nextVC == null ) {
continue;
//throw new ReviewedStingException("StartPos TreeSet has an entry for an event that is found on no haplotype. start pos = " + thisStart + ", next pos = " + nextStart);
}
if( isBiallelic ) {
final double R2 = calculateR2LD( x11, x12, x21, x22 );
if( DEBUG ) {
System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2));
System.out.println("-- " + thisVC);
System.out.println("-- " + nextVC);
}
if( R2 > MERGE_EVENTS_R2_THRESHOLD ) {
byte[] refBases = thisVC.getReference().getBases();
byte[] altBases = thisVC.getAlternateAllele(0).getBases();
final int thisPadding = ( thisVC.getReference().isNull() || thisVC.getAlternateAllele(0).isNull() ? 1 : 0 );
final int nextPadding = ( nextVC.getReference().isNull() || nextVC.getAlternateAllele(0).isNull() ? 1 : 0 );
for( int locus = thisStart + refBases.length + thisPadding ; locus < nextStart + nextPadding; locus++ ) {
final byte refByte = ref[ locus - refLoc.getStart() ];
refBases = ArrayUtils.add(refBases, refByte);
altBases = ArrayUtils.add(altBases, refByte);
}
refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases());
altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
final ArrayList<Allele> mergedAlleles = new ArrayList<Allele>();
mergedAlleles.add( Allele.create( refBases, true ) );
mergedAlleles.add( Allele.create( altBases, false ) );
final VariantContext mergedVC = ( refBases.length == altBases.length ?
new VariantContextBuilder("MNP", thisVC.getChr(), thisVC.getStart() + (thisVC.isIndel() && !thisVC.isComplexIndel() ? 1 : 0) , nextVC.getEnd(), mergedAlleles).make() :
new VariantContextBuilder("Complex", thisVC.getChr(), thisVC.getStart(), nextVC.getEnd(), mergedAlleles).referenceBaseForIndel(ref[thisStart-refLoc.getStart()]).make() );
// remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
for( final Haplotype h : haplotypes ) {
final HashMap<Integer, VariantContext> eventMap = h.getEventMap();
if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) {
eventMap.remove(thisStart);
eventMap.remove(nextStart);
eventMap.put(mergedVC.getStart(), mergedVC);
}
}
startPosKeySet.add(mergedVC.getStart());
boolean containsStart = false;
boolean containsNext = false;
for( final Haplotype h : haplotypes ) {
final HashMap<Integer, VariantContext> eventMap = h.getEventMap();
if( eventMap.containsKey(thisStart) ) { containsStart = true; }
if( eventMap.containsKey(nextStart) ) { containsNext = true; }
}
if(!containsStart) { startPosKeySet.remove(thisStart); }
if(!containsNext) { startPosKeySet.remove(nextStart); }
if( DEBUG ) { System.out.println("====> " + mergedVC); }
mapWasUpdated = true;
break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events
}
}
}
thisStart = nextStart;
}
}
}
@Requires({"x11 >= 0", "x12 >= 0", "x21 >= 0", "x22 >= 0"})
@Ensures({"result >= 0.0", "result <= 1.0"})
protected static double calculateR2LD( final int x11, final int x12, final int x21, final int x22 ) {
final int total = x11 + x12 + x21 + x22;
final double pa1b1 = ((double) x11) / ((double) total);
final double pa1b2 = ((double) x12) / ((double) total);
final double pa2b1 = ((double) x21) / ((double) total);
final double pa1 = pa1b1 + pa1b2;
final double pb1 = pa1b1 + pa2b1;
return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) );
}
@Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
protected static ArrayList<ArrayList<Haplotype>> createAlleleMapper( final int loc, final ArrayList<VariantContext> eventsAtThisLoc, final ArrayList<Haplotype> haplotypes ) {
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
final ArrayList<Haplotype> refList = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
refList.add(h);
} else {
boolean foundInEventList = false;
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
foundInEventList = true;
}
}
if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype
refList.add(h);
}
}
}
alleleMapper.add(refList);
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
list.add(h);
}
}
alleleMapper.add(list);
}
return alleleMapper;
}
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final ArrayList<ArrayList<Haplotype>> alleleMapper, final ArrayList<Haplotype> haplotypes ) {
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
final ArrayList<Allele> eventAllelesForSample = new ArrayList<Allele>();
for( final Allele a : haplotypeAllelesForSample ) {
final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a));
for( int iii = 0; iii < alleleMapper.size(); iii++ ) {
final ArrayList<Haplotype> mappedHaplotypes = alleleMapper.get(iii);
if( mappedHaplotypes.contains(haplotype) ) {
eventAllelesForSample.add(eventAlleles.get(iii));
break;
}
}
}
return eventAllelesForSample;
}
protected static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {
for( final VariantContext vc : list ) {
if( vc.hasSameAllelesAs(vcToTest) ) {
return true;
}
}
return false;
}
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
return generateVCsFromAlignment(null, alignmentStartHapwrtRef, cigar, ref, alignment, refLoc, sourceNameToAdd, MNP_LOOK_AHEAD); // BUGBUG: needed for compatibility with HaplotypeResolver code
}
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
final HashMap<Integer,VariantContext> vcs = new HashMap<Integer,VariantContext>();
int refPos = alignmentStartHapwrtRef;
if( refPos < 0 ) { return null; } // Protection against SW failures
int alignmentPos = 0;
for( final CigarElement ce : cigar.getCigarElements() ) {
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
final byte[] insertionBases = Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength );
boolean allN = true;
for( final byte b : insertionBases ) {
if( b != (byte) 'N' ) {
allN = false;
break;
}
}
if( !allN ) {
final ArrayList<Allele> insertionAlleles = new ArrayList<Allele>();
final int insertionStart = refLoc.getStart() + refPos - 1;
if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
insertionAlleles.add( Allele.create(ref[refPos-1], true) );
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
} else {
insertionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, true) );
insertionAlleles.add( Allele.create(insertionBases, false) );
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
}
}
alignmentPos += elementLength;
break;
case S:
alignmentPos += elementLength;
break;
case D:
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos, refPos + elementLength );
final ArrayList<Allele> deletionAlleles = new ArrayList<Allele>();
final int deletionStart = refLoc.getStart() + refPos - 1;
// BUGBUG: how often does this symbolic deletion allele case happen?
//if( haplotype != null && ( (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength)
// || (haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) ) ) {
// deletionAlleles.add( Allele.create(ref[refPos-1], true) );
// deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
// vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make());
//} else {
deletionAlleles.add( Allele.create(deletionBases, true) );
deletionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, false) );
vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
//}
refPos += elementLength;
break;
case M:
int numSinceMismatch = -1;
int stopOfMismatch = -1;
int startOfMismatch = -1;
int refPosStartOfMismatch = -1;
for( int iii = 0; iii < elementLength; iii++ ) {
if( ref[refPos] != alignment[alignmentPos] && alignment[alignmentPos] != ((byte) 'N') ) {
// SNP or start of possible MNP
if( stopOfMismatch == -1 ) {
startOfMismatch = alignmentPos;
stopOfMismatch = alignmentPos;
numSinceMismatch = 0;
refPosStartOfMismatch = refPos;
} else {
stopOfMismatch = alignmentPos;
}
}
if( stopOfMismatch != -1) {
numSinceMismatch++;
}
if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) {
final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 );
final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 );
final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
snpAlleles.add( Allele.create( refBases, true ) );
snpAlleles.add( Allele.create( mismatchBases, false ) );
final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
numSinceMismatch = -1;
stopOfMismatch = -1;
startOfMismatch = -1;
refPosStartOfMismatch = -1;
}
refPos++;
alignmentPos++;
}
break;
case N:
case H:
case P:
default:
throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
}
}
return vcs;
}
}

View File

@ -0,0 +1,558 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
* Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM.
*
* <h2>Input</h2>
* <p>
* Input bam file(s) from which to make calls
* </p>
*
* <h2>Output</h2>
* <p>
* VCF file with raw, unrecalibrated SNP and indel calls.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T HaplotypeCaller
* -R reference/human_g1k_v37.fasta
* -I input.bam
* -o output.raw.snps.indels.vcf
* </pre>
*
* @author rpoplin
* @since 8/22/11
*/
@PartitionBy(PartitionType.LOCUS)
@ActiveRegionExtension(extension=65, maxRegion=275)
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatibleWalker {
/**
* A raw, unfiltered, highly specific callset in VCF format.
*/
@Output(doc="File to which variants should be written", required = true)
protected VariantContextWriter vcfWriter = null;
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
protected PrintStream graphWriter = null;
@Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false)
protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN;
@Argument(fullName="keepRG", shortName="keepRG", doc="keepRG", required = false)
protected String keepRG = null;
@Argument(fullName="mnpLookAhead", shortName="mnpLookAhead", doc = "The number of bases to combine together to form MNPs out of nearby consecutive SNPs on the same haplotype", required = false)
protected int MNP_LOOK_AHEAD = 0;
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = true)
protected int MIN_PRUNE_FACTOR = 0;
@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="gcpHMM", required = false)
protected int gcpHMM = 10;
@Argument(fullName="downsampleRegion", shortName="dr", doc="coverage per sample to downsample each region to", required = false)
protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000;
@Argument(fullName="useExpandedTriggerSet", shortName="expandedTriggers", doc = "If specified, use additional, experimental triggers designed to capture larger indels but which may lead to an increase in the false positive rate", required=false)
protected boolean USE_EXPANDED_TRIGGER_SET = false;
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
protected boolean USE_ALLELES_TRIGGER = false;
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP is not used in any way for the calculations themselves.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
/**
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
* Records that are filtered in the comp track will be ignored.
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
*/
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
// The following are not used by the Unified Genotyper
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
public boolean alwaysAppendDbsnpId() { return false; }
/**
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
*/
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<String>(Arrays.asList(new String[]{"ClippingRankSumTest"}));
/**
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
* so annotations will be excluded even if they are explicitly included with the other options.
*/
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"}));
/**
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = { "Standard" };
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
private UnifiedGenotyperEngine UG_engine_simple_genotyper = null;
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
protected boolean DEBUG;
@Argument(fullName="doBanded", shortName="doBanded", doc="If specified, use the banded option", required = false)
protected boolean doBanded = false;
// the assembly engine
LocalAssemblyEngine assemblyEngine = null;
// the likelihoods engine
LikelihoodCalculationEngine likelihoodCalculationEngine = null;
// the genotyping engine
GenotypingEngine genotypingEngine = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// fasta reference reader to supplement the edges of the reference sequence
private IndexedFastaSequenceFile referenceReader;
// reference base padding size
private static final int REFERENCE_PADDING = 900;
// bases with quality less than or equal to this value are trimmed off the tails of the reads
private static final byte MIN_TAIL_QUALITY = 20;
private ArrayList<String> samplesList = new ArrayList<String>();
private final static double LOG_ONE_HALF = -Math.log10(2.0);
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
private final ArrayList<VariantContext> allelesToGenotype = new ArrayList<VariantContext>();
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
private final static Allele FAKE_ALT_ALLELE = Allele.create("<FAKE_ALT>", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
super.initialize();
// get all of the unique sample names
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
samplesList.addAll( samples );
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.STANDARD_CONFIDENCE_FOR_CALLING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
// all annotation fields from VariantAnnotatorEngine
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
// all callers need to add these standard annotation header lines
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
VCFConstants.DOWNSAMPLED_KEY,
VCFConstants.MLE_ALLELE_COUNT_KEY,
VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
// all callers need to add these standard FORMAT field header lines
VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true,
VCFConstants.GENOTYPE_KEY,
VCFConstants.GENOTYPE_QUALITY_KEY,
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_PL_KEY);
// header lines for the experimental HaplotypeCaller-specific annotations
headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));
vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
try {
// fasta reference reader to supplement the edges of the reference sequence
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
} catch( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
}
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, doBanded );
genotypingEngine = new GenotypingEngine( DEBUG, MNP_LOOK_AHEAD, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
}
//---------------------------------------------------------------------------------------------------------------
//
// isActive
//
//---------------------------------------------------------------------------------------------------------------
// enable deletions in the pileup
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable non primary reads in the active region
@Override
public boolean wantsNonPrimaryReads() { return true; }
@Override
@Ensures({"result >= 0.0", "result <= 1.0"})
public double isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
if( !allelesToGenotype.contains(vc) ) {
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object
}
}
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
return 1.0;
}
}
if( USE_ALLELES_TRIGGER ) {
return ( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
}
if( context == null ) { return 0.0; }
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
noCall.add(Allele.NO_CALL);
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
for( final String sample : splitContexts.keySet() ) {
final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event)
Arrays.fill(genotypeLikelihoods, 0.0);
for( final PileupElement p : splitContexts.get(sample).getBasePileup() ) {
final byte qual = ( USE_EXPANDED_TRIGGER_SET ?
( p.isNextToSoftClip() || p.isBeforeInsertion() || p.isAfterInsertion() ? ( p.getQual() > QualityUtils.MIN_USABLE_Q_SCORE ? p.getQual() : (byte) 20 ) : p.getQual() )
: p.getQual() );
if( p.isDeletion() || qual > (USE_EXPANDED_TRIGGER_SET ? QualityUtils.MIN_USABLE_Q_SCORE : (byte) 18) ) {
int AA = 0; final int AB = 1; int BB = 2;
if( USE_EXPANDED_TRIGGER_SET ) {
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ||
(!p.getRead().getNGSPlatform().equals(NGSPlatform.SOLID) && ((p.getRead().getReadPairedFlag() && p.getRead().getMateUnmappedFlag()) || BadMateFilter.hasBadMate(p.getRead()))) ) {
AA = 2;
BB = 0;
}
} else {
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
AA = 2;
BB = 0;
}
}
genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
}
}
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final ArrayList<Allele> alleles = new ArrayList<Allele>();
alleles.add( FAKE_REF_ALLELE );
alleles.add( FAKE_ALT_ALLELE );
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
return ( vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ) );
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
@Override
public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) {
final ArrayList<VariantContext> activeAllelesToGenotype = new ArrayList<VariantContext>();
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : allelesToGenotype ) {
if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode
}
}
allelesToGenotype.removeAll( activeAllelesToGenotype );
}
if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do!
if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do!
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do!
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region
referenceHaplotype.setIsReference(true);
final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING);
//int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion ));
final ArrayList<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype );
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
// evaluate each sample's reads against all haplotypes
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList );
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() )
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
final Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
// add some custom annotations to the calls
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
// Calculate the number of variants on the haplotype
int maxNumVar = 0;
for( final Allele allele : callResult.getFirst().getAlleles() ) {
if( !allele.isReference() ) {
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
final int numVar = haplotype.getEventMap().size();
if( numVar > maxNumVar ) { maxNumVar = numVar; }
}
}
}
// Calculate the event length
int maxLength = 0;
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
final int length = a.length() - annotatedCall.getReference().length();
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
}
myAttributes.put("NVH", maxNumVar);
myAttributes.put("NumHapEval", bestHaplotypes.size());
myAttributes.put("NumHapAssembly", haplotypes.size());
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
myAttributes.put("EVENTLENGTH", maxLength);
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
myAttributes.put("extType", annotatedCall.getType().toString() );
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
//}
if( annotatedCall.hasAttribute("QD") ) {
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
}
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
}
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
return 1; // One active region was processed during this map call
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
@Override
public Integer reduceInit() {
return 0;
}
@Override
public Integer reduce(Integer cur, Integer sum) {
return cur + sum;
}
@Override
public void onTraversalDone(Integer result) {
logger.info("Ran local assembly on " + result + " active regions");
}
//---------------------------------------------------------------------------------------------------------------
//
// private helper functions
//
//---------------------------------------------------------------------------------------------------------------
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getExtendedLoc() + " with " + activeRegion.size() + " reads:"); }
final ArrayList<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) );
activeRegion.clearReads();
// Join overlapping paired reads to create a single longer read
finalizedReadList.addAll( fragmentCollection.getSingletonReads() );
for( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) );
}
Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator());
// Loop through the reads hard clipping the adaptor and low quality tails
for( final GATKSAMRecord myRead : finalizedReadList ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// protect against INTERVALS with abnormally high coverage
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
activeRegion.add(clippedRead);
}
}
}
}
private List<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion ) {
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>();
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
if( rec.getReadLength() < 24 || rec.getMappingQuality() <= 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
readsToRemove.add(rec);
}
}
activeRegion.removeAll( readsToRemove );
return readsToRemove;
}
private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) {
final int padLeft = Math.max(activeRegion.getReferenceLoc().getStart()-REFERENCE_PADDING, 1);
final int padRight = Math.min(activeRegion.getReferenceLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReferenceLoc().getContig()).getSequenceLength());
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReferenceLoc().getContig(), padLeft, padRight);
}
private HashMap<String, ArrayList<GATKSAMRecord>> splitReadsBySample( final List<GATKSAMRecord> reads ) {
final HashMap<String, ArrayList<GATKSAMRecord>> returnMap = new HashMap<String, ArrayList<GATKSAMRecord>>();
for( final String sample : samplesList) {
ArrayList<GATKSAMRecord> readList = returnMap.get( sample );
if( readList == null ) {
readList = new ArrayList<GATKSAMRecord>();
returnMap.put(sample, readList);
}
}
for( final GATKSAMRecord read : reads ) {
returnMap.get(read.getReadGroup().getSample()).add(read);
}
return returnMap;
}
/*
private int determinePruneFactorFromCoverage( final ActiveRegion activeRegion ) {
final ArrayList<Integer> readLengthDistribution = new ArrayList<Integer>();
for( final GATKSAMRecord read : activeRegion.getReads() ) {
readLengthDistribution.add(read.getReadLength());
}
final double meanReadLength = MathUtils.average(readLengthDistribution);
final double meanCoveragePerSample = (double) activeRegion.getReads().size() / ((double) activeRegion.getExtendedLoc().size() / meanReadLength) / (double) samplesList.size();
int PRUNE_FACTOR = 0;
if( meanCoveragePerSample > 8.5 ) {
PRUNE_FACTOR = (int) Math.floor( Math.sqrt( meanCoveragePerSample - 5.0 ) );
} else if( meanCoveragePerSample > 3.0 ) {
PRUNE_FACTOR = 1;
}
if( DEBUG ) { System.out.println(String.format("Mean coverage per sample = %.1f --> prune factor = %d", meanCoveragePerSample, PRUNE_FACTOR)); }
return PRUNE_FACTOR;
}
*/
}

View File

@ -0,0 +1,441 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.util.*;
/**
* Haplotype-based resolution of variants in 2 different eval files.
*
* <p>
* HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them.
* From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants.
* Records are annotated with the set and status attributes.
*
* <h2>Input</h2>
* <p>
* 2 variant files to resolve.
* </p>
*
* <h2>Output</h2>
* <p>
* A single consensus VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx1g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T HaplotypeResolver \
* -V:v1 input1.vcf \
* -V:v2 input2.vcf \
* -o output.vcf
* </pre>
*
*/
@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW))
public class HaplotypeResolver extends RodWalker<Integer, Integer> {
protected static final String INTERSECTION_SET = "intersection";
protected static final String SAME_STATUS = "same";
protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch";
protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles";
protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype";
protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther";
protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents";
protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50;
protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000;
protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100;
protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER;
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
public List<RodBinding<VariantContext>> variants;
@Output(doc="File to which variants should be written", required=true)
protected VariantContextWriter baseWriter = null;
private VariantContextWriter writer;
/**
* Set to 'null' if you don't want the set field emitted.
*/
@Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
protected String SET_KEY = "set";
/**
* Set to 'null' if you don't want the status field emitted.
*/
@Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false)
protected String STATUS_KEY = "status";
private final LinkedList<VCcontext> queue = new LinkedList<VCcontext>();
private String source1, source2;
private final List<VariantContext> sourceVCs1 = new ArrayList<VariantContext>();
private final List<VariantContext> sourceVCs2 = new ArrayList<VariantContext>();
private class VCcontext {
public final Collection<VariantContext> vcs;
public final GenomeLoc loc;
public final ReferenceContext ref;
public VCcontext(final Collection<VariantContext> vcs, final ReferenceContext ref) {
this.vcs = vcs;
this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next());
this.ref = ref;
}
}
public void initialize() {
if ( variants.size() != 2 ) {
throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files");
}
source1 = variants.get(0).getName();
source2 = variants.get(1).getName();
if ( SET_KEY.toLowerCase().equals("null") )
SET_KEY = null;
if ( STATUS_KEY.toLowerCase().equals("null") )
STATUS_KEY = null;
// for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header)
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
if ( SET_KEY != null )
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record"));
if ( STATUS_KEY != null )
headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match"));
final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.<String>emptySet());
baseWriter.writeHeader(vcfHeader);
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
final Collection<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
if ( VCs.size() == 0 )
return 0;
final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref);
// TODO -- what should we do about filtered records?
if ( !queue.isEmpty() ) {
final VCcontext previous = queue.getLast();
if ( !previous.loc.onSameContig(vc.loc) ||
previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS ||
queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) {
purgeQueue();
}
}
queue.addLast(vc);
return 0;
}
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
public void onTraversalDone(Integer result) {
if ( !queue.isEmpty() )
purgeQueue();
writer.close();
}
private void purgeQueue() {
final ReferenceContext refContext = queue.getFirst().ref;
// divide them up by source
while ( !queue.isEmpty() ) {
VCcontext context = queue.removeFirst();
for ( final VariantContext vc: context.vcs ) {
if ( vc.getSource().equals(source1) )
sourceVCs1.add(vc);
else
sourceVCs2.add(vc);
}
}
writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS);
if ( sourceVCs1.isEmpty() ) {
writeAll(sourceVCs2, source2, null);
} else if ( sourceVCs2.isEmpty() ) {
writeAll(sourceVCs1, source1, null);
} else {
resolveByHaplotype(refContext);
}
// allow for GC of the data
sourceVCs1.clear();
sourceVCs2.clear();
}
private void writeAll(final List<VariantContext> sourceVCs, final String set, final String status) {
for ( final VariantContext vc : sourceVCs ) {
writeOne(vc, set, status);
}
}
private void writeOne(final VariantContext vc, final String set, final String status) {
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
if ( SET_KEY != null && set != null )
attrs.put(SET_KEY, set);
if ( STATUS_KEY != null && status != null )
attrs.put(STATUS_KEY, status);
writer.add(new VariantContextBuilder(vc).attributes(attrs).make());
}
private void writeAndPurgeAllEqualVariants(final List<VariantContext> sourceVCs1, final List<VariantContext> sourceVCs2, final String status) {
int currentIndex1 = 0, currentIndex2 = 0;
int size1 = sourceVCs1.size(), size2 = sourceVCs2.size();
VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
while ( current1 != null && current2 != null ) {
final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);
if ( loc1.equals(loc2) ||
(loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) {
// test the alleles
if ( determineAndWriteOverlap(current1, current2, status) ) {
sourceVCs1.remove(currentIndex1);
sourceVCs2.remove(currentIndex2);
size1--;
size2--;
} else {
currentIndex1++;
currentIndex2++;
}
current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
} else if ( loc1.isBefore(loc2) ) {
currentIndex1++;
current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
} else {
currentIndex2++;
current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
}
}
}
private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) {
final int allelesFrom1In2 = findOverlap(vc1, vc2);
final int allelesFrom2In1 = findOverlap(vc2, vc1);
final int totalAllelesIn1 = vc1.getAlternateAlleles().size();
final int totalAllelesIn2 = vc2.getAlternateAlleles().size();
final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1;
final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2;
boolean thereIsOverlap = true;
if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) {
writeOne(vc1, INTERSECTION_SET, status);
} else if ( allAllelesFrom1Overlap ) {
writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2);
} else if ( allAllelesFrom2Overlap ) {
writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1);
} else if ( allelesFrom1In2 > 0 ) {
writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS);
} else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction
writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS);
} else {
thereIsOverlap = false;
}
return thereIsOverlap;
}
private static int findOverlap(final VariantContext target, final VariantContext comparison) {
int overlap = 0;
for ( final Allele allele : target.getAlternateAlleles() ) {
if ( comparison.hasAlternateAllele(allele) )
overlap++;
}
return overlap;
}
private static final double SW_MATCH = 4.0;
private static final double SW_MISMATCH = -10.0;
private static final double SW_GAP = -25.0;
private static final double SW_GAP_EXTEND = -1.3;
private void resolveByHaplotype(final ReferenceContext refContext) {
final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext);
final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext);
final SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
// protect against SW failures
if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 ||
swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) {
// TODO -- handle errors appropriately
logger.debug("Bad SW alignment; aborting at " + refContext.getLocus());
return;
}
// order results by start position
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1, 0));
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2, 0));
if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
// TODO -- handle errors appropriately
logger.debug("No source alleles; aborting at " + refContext.getLocus());
return;
}
// create lists and test for equality
final List<VariantContext> source1Alleles = new ArrayList<VariantContext>(source1Map.values());
final List<VariantContext> source2Alleles = new ArrayList<VariantContext>(source2Map.values());
writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS);
if ( source1Alleles.isEmpty() ) {
writeAll(source2Alleles, source2, null);
} else if ( source2Alleles.isEmpty() ) {
writeAll(source1Alleles, source1, null);
} else {
writeDifferences(source1Alleles, source2Alleles);
}
}
private byte[] generateHaplotype(final List<VariantContext> sourceVCs, final ReferenceContext refContext) {
final StringBuilder sb = new StringBuilder();
final int startPos = refContext.getWindow().getStart();
int currentPos = startPos;
final byte[] reference = refContext.getBases();
for ( final VariantContext vc : sourceVCs ) {
// add any missing reference context
int vcStart = vc.getStart();
final int refAlleleLength = vc.getReference().length();
if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position)
vcStart++;
while ( currentPos < vcStart )
sb.append((char)reference[currentPos++ - startPos]);
// add the alt allele
sb.append(vc.getAlternateAllele(0).getBaseString());
// skip the reference allele
currentPos += refAlleleLength;
}
// add any missing reference context
final int stopPos = refContext.getWindow().getStop();
while ( currentPos < stopPos )
sb.append((char)reference[currentPos++ - startPos]);
return sb.toString().getBytes();
}
private void writeDifferences(final List<VariantContext> source1Alleles, final List<VariantContext> source2Alleles) {
int currentIndex1 = 0, currentIndex2 = 0;
final int size1 = source1Alleles.size(), size2 = source2Alleles.size();
VariantContext current1 = source1Alleles.get(0);
VariantContext current2 = source2Alleles.get(0);
while ( currentIndex1 < size1 || currentIndex2 < size2 ) {
if ( current1 == null ) {
writeOne(current2, source2, null);
currentIndex2++;
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
} else if ( current2 == null ) {
writeOne(current1, source1, null);
currentIndex1++;
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
} else {
final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);
if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) {
String status;
if ( loc1.getStart() == loc2.getStart() ) {
final String allele1 = current1.getAlternateAllele(0).getBaseString();
final String allele2 = current2.getAlternateAllele(0).getBaseString();
if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 )
status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS;
else
status = SAME_START_DIFFERENT_ALLELES_STATUS;
} else {
status = OVERLAPPING_EVENTS_STATUS;
}
writeOne(current1, INTERSECTION_SET, status);
currentIndex1++;
currentIndex2++;
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
} else if ( loc1.isBefore(loc2) ) {
writeOne(current1, source1, null);
currentIndex1++;
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
} else {
writeOne(current2, source2, null);
currentIndex2++;
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
}
}
}
}
}

View File

@ -0,0 +1,149 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.jgrapht.graph.DefaultDirectedGraph;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 23, 2011
*/
// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
// This is different from most graph traversals because we want to test paths from any source node to any sink node.
public class KBestPaths {
// static access only
protected KBestPaths() { }
private static int MAX_PATHS_TO_HOLD = 100;
protected static class MyInt { public int val = 0; }
// class to keep track of paths
protected static class Path {
// the last vertex seen in the path
private DeBruijnVertex lastVertex;
// the list of edges comprising the path
private ArrayList<DeBruijnEdge> edges;
// the scores for the path
private int totalScore = 0, lowestEdge = -1;
public Path( final DeBruijnVertex initialVertex ) {
lastVertex = initialVertex;
edges = new ArrayList<DeBruijnEdge>(0);
}
public Path( final Path p, final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
lastVertex = graph.getEdgeTarget(edge);
edges = new ArrayList<DeBruijnEdge>(p.edges);
edges.add(edge);
totalScore = p.totalScore + edge.getMultiplicity();
lowestEdge = ( p.lowestEdge == -1 ) ? edge.getMultiplicity() : Math.min(p.lowestEdge, edge.getMultiplicity());
}
public boolean containsEdge( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
final DeBruijnVertex targetVertex = graph.getEdgeTarget(edge);
for( final DeBruijnEdge e : edges ) {
if( e.equals(graph, edge) || graph.getEdgeTarget(e).equals(targetVertex) ) {
return true;
}
}
return false;
}
public ArrayList<DeBruijnEdge> getEdges() { return edges; }
public int getScore() { return totalScore; }
public int getLowestEdge() { return lowestEdge; }
public DeBruijnVertex getLastVertexInPath() { return lastVertex; }
public byte[] getBases( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
if( edges.size() == 0 ) { return lastVertex.getSequence(); }
byte[] bases = graph.getEdgeSource( edges.get(0) ).getSequence();
for( final DeBruijnEdge e : edges ) {
bases = ArrayUtils.addAll(bases, graph.getEdgeTarget( e ).getSuffix());
}
return bases;
}
}
protected static class PathComparatorTotalScore implements Comparator<Path> {
public int compare(final Path path1, final Path path2) {
return path1.totalScore - path2.totalScore;
}
}
protected static class PathComparatorLowestEdge implements Comparator<Path> {
public int compare(final Path path1, final Path path2) {
return path2.lowestEdge - path1.lowestEdge;
}
}
public static List<Path> getKBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final int k ) {
if( k > MAX_PATHS_TO_HOLD/2 ) { throw new ReviewedStingException("Asked for more paths than MAX_PATHS_TO_HOLD!"); }
final ArrayList<Path> bestPaths = new ArrayList<Path>();
// run a DFS for best paths
for( final DeBruijnVertex v : graph.vertexSet() ) {
if( graph.inDegreeOf(v) == 0 ) {
findBestPaths(graph, new Path(v), bestPaths);
}
}
Collections.sort(bestPaths, new PathComparatorLowestEdge() );
Collections.reverse(bestPaths);
return bestPaths.subList(0, Math.min(k, bestPaths.size()));
}
private static void findBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path, final List<Path> bestPaths ) {
findBestPaths(graph, path, bestPaths, new MyInt());
}
private static void findBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path, final List<Path> bestPaths, MyInt n ) {
// did we hit the end of a path?
if ( allOutgoingEdgesHaveBeenVisited(graph, path) ) {
if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
// clean out some low scoring paths
Collections.sort(bestPaths, new PathComparatorLowestEdge() );
for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); }
}
bestPaths.add(path);
} else if( n.val > 10000) {
// do nothing, just return
} else {
// recursively run DFS
final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>();
edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex));
Collections.sort(edgeArrayList);
Collections.reverse(edgeArrayList);
for ( final DeBruijnEdge edge : edgeArrayList ) {
// make sure the edge is not already in the path
if ( path.containsEdge(graph, edge) )
continue;
final Path newPath = new Path(path, graph, edge);
n.val++;
findBestPaths(graph, newPath, bestPaths, n);
}
}
}
private static boolean allOutgoingEdgesHaveBeenVisited( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path ) {
for( final DeBruijnEdge edge : graph.outgoingEdgesOf(path.lastVertex) ) {
if( !path.containsEdge(graph, edge) ) {
return false;
}
}
return true;
}
}

View File

@ -0,0 +1,403 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
public class LikelihoodCalculationEngine {
private static final double LOG_ONE_HALF = -Math.log10(2.0);
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
private final byte constantGCP;
private final boolean DEBUG;
private final PairHMM pairHMM;
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
pairHMM = new PairHMM( noBanded );
this.constantGCP = constantGCP;
DEBUG = debug;
}
public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
final int numHaplotypes = haplotypes.size();
int X_METRIC_LENGTH = 0;
for( final String sample : perSampleReadList.keySet() ) {
for( final GATKSAMRecord read : perSampleReadList.get(sample) ) {
final int readLength = read.getReadLength();
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
}
}
int Y_METRIC_LENGTH = 0;
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final int haplotypeLength = haplotypes.get(jjj).getBases().length;
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
}
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
X_METRIC_LENGTH += 2;
Y_METRIC_LENGTH += 2;
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
// for each sample's reads
for( final String sample : perSampleReadList.keySet() ) {
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
// evaluate the likelihood of the reads given those haplotypes
computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray );
}
}
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
final int numHaplotypes = haplotypes.size();
final int numReads = reads.size();
final double[][] readLikelihoods = new double[numHaplotypes][numReads];
for( int iii = 0; iii < numReads; iii++ ) {
final GATKSAMRecord read = reads.get(iii);
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
Haplotype previousHaplotypeSeen = null;
final byte[] readQuals = read.getBaseQualities();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
readQuals[kkk] = ( readQuals[kkk] < (byte) 17 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
previousHaplotypeSeen = haplotype;
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP,
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
}
}
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] );
}
}
private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){
if( b1[iii] != b2[iii] ) {
return iii;
}
}
return b1.length;
}
@Requires({"haplotypes.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final String sample ) {
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
list.add(h);
haplotypeMapping.add(list);
}
return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping );
}
@Requires({"haplotypeMapping.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
final int numHaplotypes = haplotypeMapping.size();
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
for( int iii = 0; iii < numHaplotypes; iii++ ) {
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
}
// compute the diploid haplotype likelihoods
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
double haplotypeLikelihood = 0.0;
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
}
}
}
}
// normalize the diploid likelihoods matrix
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
}
@Requires({"haplotypes.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final Set<String> samples ) {
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
list.add(h);
haplotypeMapping.add(list);
}
final int numHaplotypes = haplotypeMapping.size();
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
for( int iii = 0; iii < numHaplotypes; iii++ ) {
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
}
// compute the diploid haplotype likelihoods
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
double haplotypeLikelihood = 0.0;
for( final String sample : samples ) {
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
}
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
}
}
}
}
// normalize the diploid likelihoods matrix
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
}
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
@Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"})
protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) {
final int numHaplotypes = likelihoodMatrix.length;
double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
int index = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ){
genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj];
}
}
genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
index = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ){
likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++];
}
}
return likelihoodMatrix;
}
/*
@Requires({"haplotypes.size() > 0"})
@Ensures({"result.size() <= haplotypes.size()"})
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
// BUGBUG: This function needs a lot of work. Need to use 4-gamete test or Tajima's D to decide to break up events into separate pieces for genotyping
final int numHaplotypes = haplotypes.size();
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
final double[][][] haplotypeLikelihoodMatrix = new double[sampleKeySet.size()][numHaplotypes][numHaplotypes];
int sampleCount = 0;
for( final String sample : sampleKeySet ) {
haplotypeLikelihoodMatrix[sampleCount++] = computeDiploidHaplotypeLikelihoods( haplotypes, sample );
}
int hap1 = 0;
int hap2 = 0;
int chosenSample = 0;
//double bestElement = Double.NEGATIVE_INFINITY;
final int maxChosenHaplotypes = Math.min( 15, sampleKeySet.size() * 2 + 1 );
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
double maxElement = Double.NEGATIVE_INFINITY;
for( int kkk = 0; kkk < sampleCount; kkk++ ) {
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
if( haplotypeLikelihoodMatrix[kkk][iii][jjj] > maxElement ) {
maxElement = haplotypeLikelihoodMatrix[kkk][iii][jjj];
hap1 = iii;
hap2 = jjj;
chosenSample = kkk;
}
}
}
}
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
haplotypeLikelihoodMatrix[chosenSample][iii][jjj] = Double.NEGATIVE_INFINITY;
}
}
}
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
final ArrayList<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
for( final int hIndex : bestHaplotypesIndexList ) {
bestHaplotypes.add( haplotypes.get(hIndex) );
}
return bestHaplotypes;
}
*/
@Requires({"haplotypes.size() > 0"})
@Ensures({"result.size() <= haplotypes.size()"})
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
final int numHaplotypes = haplotypes.size();
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( haplotypes, sampleKeySet ); // all samples pooled together
int hap1 = 0;
int hap2 = 0;
//double bestElement = Double.NEGATIVE_INFINITY;
final int maxChosenHaplotypes = Math.min( 8, sampleKeySet.size() * 2 + 1 );
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
double maxElement = Double.NEGATIVE_INFINITY;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
maxElement = haplotypeLikelihoodMatrix[iii][jjj];
hap1 = iii;
hap2 = jjj;
}
}
}
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
}
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
final ArrayList<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
for( final int hIndex : bestHaplotypesIndexList ) {
bestHaplotypes.add( haplotypes.get(hIndex) );
}
return bestHaplotypes;
}
public static Map<String, Map<Allele, List<GATKSAMRecord>>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
final Map<String, Map<Allele, List<GATKSAMRecord>>> returnMap = new HashMap<String, Map<Allele, List<GATKSAMRecord>>>();
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
for( final String sample : perSampleReadList.keySet() ) {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
final ArrayList<GATKSAMRecord> readsForThisSample = perSampleReadList.get(sample);
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
final double likelihoods[] = new double[call.getFirst().getAlleles().size()];
int count = 0;
for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood
double maxLikelihood = Double.NEGATIVE_INFINITY;
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
final double likelihood = h.getReadLikelihoods(sample)[iii];
if( likelihood > maxLikelihood ) {
maxLikelihood = likelihood;
}
}
likelihoods[count++] = maxLikelihood;
}
final int bestAllele = MathUtils.maxElementIndex(likelihoods);
final double bestLikelihood = likelihoods[bestAllele];
Allele allele = Allele.NO_CALL;
boolean isInformativeRead = false;
for( final double likelihood : likelihoods ) {
if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) {
isInformativeRead = true;
break;
}
}
// uninformative reads get the no call Allele
if( isInformativeRead ) {
allele = call.getFirst().getAlleles().get(bestAllele);
}
List<GATKSAMRecord> readList = alleleReadMap.get(allele);
if( readList == null ) {
readList = new ArrayList<GATKSAMRecord>();
alleleReadMap.put(allele, readList);
}
readList.add(read);
}
}
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
List<GATKSAMRecord> readList = alleleReadMap.get(Allele.NO_CALL);
if( readList == null ) {
readList = new ArrayList<GATKSAMRecord>();
alleleReadMap.put(Allele.NO_CALL, readList);
}
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample) ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
readList.add(read);
}
}
returnMap.put(sample, alleleReadMap);
}
return returnMap;
}
}

View File

@ -0,0 +1,25 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 14, 2011
*/
public abstract class LocalAssemblyEngine {
public enum ASSEMBLER {
SIMPLE_DE_BRUIJN
}
protected LocalAssemblyEngine() {
}
public abstract ArrayList<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, ArrayList<VariantContext> activeAllelesToGenotype);
}

View File

@ -0,0 +1,372 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.jgrapht.graph.DefaultDirectedGraph;
import java.io.PrintStream;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: ebanks, rpoplin
* Date: Mar 14, 2011
*/
public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
private static final byte MIN_QUALITY = (byte) 17;
// Smith-Waterman parameters originally copied from IndelRealigner
private static final double SW_MATCH = 5.0; // 1.0;
private static final double SW_MISMATCH = -10.0; //-1.0/3.0;
private static final double SW_GAP = -22.0; //-1.0-1.0/3.0;
private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;
private final boolean DEBUG;
private final PrintStream GRAPH_WRITER;
private final ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>> graphs = new ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>>();
private int PRUNE_FACTOR = 1;
public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter ) {
super();
DEBUG = debug;
GRAPH_WRITER = graphWriter;
}
public ArrayList<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList<VariantContext> activeAllelesToGenotype ) {
this.PRUNE_FACTOR = PRUNE_FACTOR;
// create the graphs
createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
// clean up the graphs by pruning and merging
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
pruneGraph( graph, PRUNE_FACTOR );
//eliminateNonRefPaths( graph );
mergeNodes( graph );
}
if( GRAPH_WRITER != null ) {
printGraphs();
}
// find the best paths in the graphs
return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
}
private void createDeBruijnGraphs( final ArrayList<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
graphs.clear();
// create the graph
for( int kmer = 31; kmer <= 75; kmer += 6 ) {
final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) {
graphs.add(graph);
}
}
}
protected static void mergeNodes( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
boolean foundNodesToMerge = true;
while( foundNodesToMerge ) {
foundNodesToMerge = false;
for( final DeBruijnEdge e : graph.edgeSet() ) {
final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e);
final DeBruijnVertex incomingVertex = graph.getEdgeSource(e);
if( !outgoingVertex.equals(incomingVertex) && graph.inDegreeOf(outgoingVertex) == 1 && graph.outDegreeOf(incomingVertex) == 1) {
final Set<DeBruijnEdge> outEdges = graph.outgoingEdgesOf(outgoingVertex);
final Set<DeBruijnEdge> inEdges = graph.incomingEdgesOf(incomingVertex);
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
} else if( inEdges.size() == 1 ) {
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
} else if( outEdges.size() == 1 ) {
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
}
final DeBruijnVertex addedVertex = new DeBruijnVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer );
graph.addVertex(addedVertex);
for( final DeBruijnEdge edge : outEdges ) {
graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity()));
}
for( final DeBruijnEdge edge : inEdges ) {
graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity()));
}
graph.removeVertex( incomingVertex );
graph.removeVertex( outgoingVertex );
foundNodesToMerge = true;
break;
}
}
}
}
protected static void pruneGraph( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final int pruneFactor ) {
final ArrayList<DeBruijnEdge> edgesToRemove = new ArrayList<DeBruijnEdge>();
for( final DeBruijnEdge e : graph.edgeSet() ) {
if( e.getMultiplicity() <= pruneFactor && !e.getIsRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
edgesToRemove.add(e);
}
}
graph.removeAllEdges(edgesToRemove);
// Run through the graph and clean up singular orphaned nodes
final ArrayList<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
for( final DeBruijnVertex v : graph.vertexSet() ) {
if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) {
verticesToRemove.add(v);
}
}
graph.removeAllVertices(verticesToRemove);
}
protected static void eliminateNonRefPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
final ArrayList<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
boolean done = false;
while( !done ) {
done = true;
for( final DeBruijnVertex v : graph.vertexSet() ) {
if( graph.inDegreeOf(v) == 0 || graph.outDegreeOf(v) == 0 ) {
boolean isRefNode = false;
for( final DeBruijnEdge e : graph.edgesOf(v) ) {
if( e.getIsRef() ) {
isRefNode = true;
break;
}
}
if( !isRefNode ) {
done = false;
verticesToRemove.add(v);
}
}
}
graph.removeAllVertices(verticesToRemove);
verticesToRemove.clear();
}
}
private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final ArrayList<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
final byte[] refSequence = refHaplotype.getBases();
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
System.arraycopy(refSequence, i, kmer1, 0, KMER_LENGTH);
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(refSequence, i+1, kmer2, 0, KMER_LENGTH);
if( !addKmersToGraph(graph, kmer1, kmer2, true) ) {
if( DEBUG ) {
System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping");
}
return false;
}
}
}
for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases();
final byte[] qualities = read.getBaseQualities();
if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
// if the qualities of all the bases in the kmers are high enough
boolean badKmer = false;
for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) {
if( qualities[jjj] < MIN_QUALITY ) {
badKmer = true;
break;
}
}
if( !badKmer ) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
System.arraycopy(sequence, iii, kmer1, 0, KMER_LENGTH);
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH);
addKmersToGraph(graph, kmer1, kmer2, false);
}
}
}
}
return true;
}
protected static boolean addKmersToGraph( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final byte[] kmer1, final byte[] kmer2, final boolean isRef ) {
final int numVertexBefore = graph.vertexSet().size();
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length );
graph.addVertex(v1);
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length );
graph.addVertex(v2);
if( isRef && graph.vertexSet().size() == numVertexBefore ) { return false; }
final DeBruijnEdge targetEdge = graph.getEdge(v1, v2);
if ( targetEdge == null ) {
graph.addEdge(v1, v2, new DeBruijnEdge( isRef ));
} else {
if( isRef ) {
targetEdge.setIsRef( true );
}
targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1);
}
return true;
}
private void printGraphs() {
int count = 0;
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
GRAPH_WRITER.println("digraph kmer" + count++ +" {");
for( final DeBruijnEdge edge : graph.edgeSet() ) {
if( edge.getMultiplicity() > PRUNE_FACTOR ) {
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];");
}
if( edge.getIsRef() ) {
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
}
if( !edge.getIsRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
}
for( final DeBruijnVertex v : graph.vertexSet() ) {
final String label = ( graph.inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() );
GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]");
}
GRAPH_WRITER.println("}");
}
}
@Ensures({"result.contains(refHaplotype)"})
private ArrayList<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final ArrayList<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
final ArrayList<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();
// add the reference haplotype separately from all the others
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( fullReferenceWithPadding, refHaplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
refHaplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
refHaplotype.setCigar( swConsensus.getCigar() );
if( !returnHaplotypes.add( refHaplotype ) ) {
throw new ReviewedStingException("Unable to add reference haplotype during assembly: " + refHaplotype);
}
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart());
if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
return returnHaplotypes;
//throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype);
}
}
}
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
}
}
}
}
}
}
}
if( DEBUG ) {
if( returnHaplotypes.size() > 1 ) {
System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against.");
} else {
System.out.println("Found only the reference haplotype in the assembly graph.");
}
for( final Haplotype h : returnHaplotypes ) {
System.out.println( h.toString() );
System.out.println( "> Cigar = " + h.getCigar() );
}
}
return returnHaplotypes;
}
private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop ) {
//final int sizeOfActiveRegion = activeRegionStop - activeRegionStart;
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0) );
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 ) { // protect against SW failures
return false;
}
final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true );
int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
}
byte[] newHaplotypeBases;
// extend partial haplotypes to contain the full active region sequence
int leftBreakPoint = 0;
int rightBreakPoint = 0;
if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
haplotype.getBases()),
ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart;
rightBreakPoint = leftBreakPoint + haplotype.getBases().length;
//newHaplotypeBases = haplotype.getBases();
//return false; // piece of haplotype isn't anchored within the active region so don't build a haplotype out of it
} else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
//return false;
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
//newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), 0, hapStop);
leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart;
} else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
//return false;
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
//newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length);
rightBreakPoint = haplotype.getBases().length - hapStart;
} else {
newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
}
final Haplotype h = new Haplotype( newHaplotypeBases );
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) );
h.leftBreakPoint = leftBreakPoint;
h.rightBreakPoint = rightBreakPoint;
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures
return false;
}
if( !haplotypeList.contains(h) ) {
haplotypeList.add(h);
return true;
} else {
return false;
}
}
}

View File

@ -0,0 +1,271 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/15/12
*/
import net.sf.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
* Unit tests for GenotypingEngine
*/
public class GenotypingEngineUnitTest extends BaseTest {
private static ReferenceSequenceFile seq;
private GenomeLocParser genomeLocParser;
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
genomeLocParser = new GenomeLocParser(seq);
}
@Test
public void testFindHomVarEventAllelesInSample() {
final List<Allele> eventAlleles = new ArrayList<Allele>();
eventAlleles.add( Allele.create("A", true) );
eventAlleles.add( Allele.create("C", false) );
final List<Allele> haplotypeAlleles = new ArrayList<Allele>();
haplotypeAlleles.add( Allele.create("AATA", true) );
haplotypeAlleles.add( Allele.create("AACA", false) );
haplotypeAlleles.add( Allele.create("CATA", false) );
haplotypeAlleles.add( Allele.create("CACA", false) );
final ArrayList<Haplotype> haplotypes = new ArrayList<Haplotype>();
haplotypes.add(new Haplotype("AATA".getBytes()));
haplotypes.add(new Haplotype("AACA".getBytes()));
haplotypes.add(new Haplotype("CATA".getBytes()));
haplotypes.add(new Haplotype("CACA".getBytes()));
final List<Allele> haplotypeAllelesForSample = new ArrayList<Allele>();
haplotypeAllelesForSample.add( Allele.create("CATA", false) );
haplotypeAllelesForSample.add( Allele.create("CACA", false) );
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
ArrayList<Haplotype> Aallele = new ArrayList<Haplotype>();
Aallele.add(haplotypes.get(0));
Aallele.add(haplotypes.get(1));
ArrayList<Haplotype> Callele = new ArrayList<Haplotype>();
Callele.add(haplotypes.get(2));
Callele.add(haplotypes.get(3));
alleleMapper.add(Aallele);
alleleMapper.add(Callele);
final List<Allele> eventAllelesForSample = new ArrayList<Allele>();
eventAllelesForSample.add( Allele.create("C", false) );
eventAllelesForSample.add( Allele.create("C", false) );
if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) {
logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes));
logger.warn("expected alleles = " + eventAllelesForSample);
}
Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)));
}
@Test
public void testFindHetEventAllelesInSample() {
final List<Allele> eventAlleles = new ArrayList<Allele>();
eventAlleles.add( Allele.create("A", true) );
eventAlleles.add( Allele.create("C", false) );
eventAlleles.add( Allele.create("T", false) );
final List<Allele> haplotypeAlleles = new ArrayList<Allele>();
haplotypeAlleles.add( Allele.create("AATA", true) );
haplotypeAlleles.add( Allele.create("AACA", false) );
haplotypeAlleles.add( Allele.create("CATA", false) );
haplotypeAlleles.add( Allele.create("CACA", false) );
haplotypeAlleles.add( Allele.create("TACA", false) );
haplotypeAlleles.add( Allele.create("TTCA", false) );
haplotypeAlleles.add( Allele.create("TTTA", false) );
final ArrayList<Haplotype> haplotypes = new ArrayList<Haplotype>();
haplotypes.add(new Haplotype("AATA".getBytes()));
haplotypes.add(new Haplotype("AACA".getBytes()));
haplotypes.add(new Haplotype("CATA".getBytes()));
haplotypes.add(new Haplotype("CACA".getBytes()));
haplotypes.add(new Haplotype("TACA".getBytes()));
haplotypes.add(new Haplotype("TTCA".getBytes()));
haplotypes.add(new Haplotype("TTTA".getBytes()));
final List<Allele> haplotypeAllelesForSample = new ArrayList<Allele>();
haplotypeAllelesForSample.add( Allele.create("TTTA", false) );
haplotypeAllelesForSample.add( Allele.create("AATA", true) );
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
ArrayList<Haplotype> Aallele = new ArrayList<Haplotype>();
Aallele.add(haplotypes.get(0));
Aallele.add(haplotypes.get(1));
ArrayList<Haplotype> Callele = new ArrayList<Haplotype>();
Callele.add(haplotypes.get(2));
Callele.add(haplotypes.get(3));
ArrayList<Haplotype> Tallele = new ArrayList<Haplotype>();
Tallele.add(haplotypes.get(4));
Tallele.add(haplotypes.get(5));
Tallele.add(haplotypes.get(6));
alleleMapper.add(Aallele);
alleleMapper.add(Callele);
alleleMapper.add(Tallele);
final List<Allele> eventAllelesForSample = new ArrayList<Allele>();
eventAllelesForSample.add( Allele.create("A", true) );
eventAllelesForSample.add( Allele.create("T", false) );
if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) {
logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes));
logger.warn("expected alleles = " + eventAllelesForSample);
}
Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)));
}
private boolean compareAlleleLists(List<Allele> l1, List<Allele> l2) {
if( l1.size() != l2.size() ) {
return false; // sanity check
}
for( int i=0; i < l1.size(); i++ ){
if ( !l2.contains(l1.get(i)) )
return false;
}
return true;
}
private class BasicGenotypingTestProvider extends TestDataProvider {
byte[] ref;
byte[] hap;
HashMap<Integer,Byte> expected;
GenotypingEngine ge = new GenotypingEngine(false, 0, false);
public BasicGenotypingTestProvider(String refString, String hapString, HashMap<Integer, Byte> expected) {
super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString));
ref = refString.getBytes();
hap = hapString.getBytes();
this.expected = expected;
}
public HashMap<Integer,VariantContext> calcAlignment() {
final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap);
return ge.generateVCsFromAlignment( alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name", 0);
}
}
@DataProvider(name = "BasicGenotypingTestProvider")
public Object[][] makeBasicGenotypingTests() {
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
map.put(1 + contextSize, (byte)'M');
final String context = Utils.dupString('G', contextSize);
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACTAGCCGATAG", map);
}
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
map.put(2 + contextSize, (byte)'M');
map.put(21 + contextSize, (byte)'M');
final String context = Utils.dupString('G', contextSize);
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG", "ATCTCGCATCGCGAGCATCGCCTAGCCGATAG", map);
}
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
map.put(1 + contextSize, (byte)'M');
map.put(20 + contextSize, (byte)'I');
final String context = Utils.dupString('G', contextSize);
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACACTAGCCGATAG", map);
}
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
map.put(1 + contextSize, (byte)'M');
map.put(20 + contextSize, (byte)'D');
final String context = Utils.dupString('G', contextSize);
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map);
}
for( int contextSize : new int[]{1,5,9,24,36} ) {
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
map.put(1, (byte)'M');
map.put(20, (byte)'D');
final String context = Utils.dupString('G', contextSize);
new BasicGenotypingTestProvider("AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map);
}
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
map.put(2 + contextSize, (byte)'M');
map.put(20 + contextSize, (byte)'I');
map.put(30 + contextSize, (byte)'D');
final String context = Utils.dupString('G', contextSize);
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "ACCTCGCATCGCGAGCATCGTTACTAGCCGATG", map);
}
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
map.put(1 + contextSize, (byte)'M');
map.put(20 + contextSize, (byte)'D');
map.put(28 + contextSize, (byte)'M');
final String context = Utils.dupString('G', contextSize);
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCCATAG", map);
}
return BasicGenotypingTestProvider.getTests(BasicGenotypingTestProvider.class);
}
@Test(dataProvider = "BasicGenotypingTestProvider", enabled = true)
public void testHaplotypeToVCF(BasicGenotypingTestProvider cfg) {
HashMap<Integer,VariantContext> calculatedMap = cfg.calcAlignment();
HashMap<Integer,Byte> expectedMap = cfg.expected;
logger.warn(String.format("Test: %s", cfg.toString()));
if(!compareVCMaps(calculatedMap, expectedMap)) {
logger.warn("calc map = " + calculatedMap);
logger.warn("expected map = " + expectedMap);
}
Assert.assertTrue(compareVCMaps(calculatedMap, expectedMap));
}
/**
* Tests that we get the right values from the binomial distribution
*/
@Test
public void testCalculateR2LD() {
logger.warn("Executing testCalculateR2LD");
Assert.assertEquals(GenotypingEngine.calculateR2LD(1,1,1,1), 0.0, 0.00001);
Assert.assertEquals(GenotypingEngine.calculateR2LD(100,100,100,100), 0.0, 0.00001);
Assert.assertEquals(GenotypingEngine.calculateR2LD(1,0,0,1), 1.0, 0.00001);
Assert.assertEquals(GenotypingEngine.calculateR2LD(100,0,0,100), 1.0, 0.00001);
Assert.assertEquals(GenotypingEngine.calculateR2LD(1,2,3,4), (0.1 - 0.12) * (0.1 - 0.12) / (0.3 * 0.7 * 0.4 * 0.6), 0.00001);
}
/**
* Private function to compare HashMap of VCs, it only checks the types and start locations of the VariantContext
*/
private boolean compareVCMaps(HashMap<Integer, VariantContext> calc, HashMap<Integer, Byte> expected) {
if( !calc.keySet().equals(expected.keySet()) ) { return false; } // sanity check
for( Integer loc : expected.keySet() ) {
Byte type = expected.get(loc);
switch( type ) {
case 'I':
if( !calc.get(loc).isSimpleInsertion() ) { return false; }
break;
case 'D':
if( !calc.get(loc).isSimpleDeletion() ) { return false; }
break;
case 'M':
if( !(calc.get(loc).isMNP() || calc.get(loc).isSNP()) ) { return false; }
break;
default:
return false;
}
}
return true;
}
}

View File

@ -0,0 +1,36 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String REF = b37KGReference;
final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
//final String RECAL_FILE = validationDataLocation + "NA12878.kmer.8.subset.recal_data.bqsr";
private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCaller: args=" + args, spec);
}
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "7b4e76934e0c911220b4e7da8776ab2b");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "fcf0cea98a571d5e2d1dfa8b5edc599d");
}
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "ff370c42c8b09a29f1aeff5ac57c7ea6");
}
}

View File

@ -0,0 +1,173 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/14/12
*/
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
* Unit tests for LikelihoodCalculationEngine
*/
public class LikelihoodCalculationEngineUnitTest extends BaseTest {
@Test
public void testNormalizeDiploidLikelihoodMatrixFromLog10() {
double[][] likelihoodMatrix = {
{-90.2, 0, 0},
{-190.1, -2.1, 0},
{-7.0, -17.5, -35.9}
};
double[][] normalizedMatrix = {
{-88.1, 0, 0},
{-188.0, 0.0, 0},
{-4.9, -15.4, -33.8}
};
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix), normalizedMatrix));
double[][] likelihoodMatrix2 = {
{-90.2, 0, 0, 0},
{-190.1, -2.1, 0, 0},
{-7.0, -17.5, -35.9, 0},
{-7.0, -17.5, -35.9, -1000.0},
};
double[][] normalizedMatrix2 = {
{-88.1, 0, 0, 0},
{-188.0, 0.0, 0, 0},
{-4.9, -15.4, -33.8, 0},
{-4.9, -15.4, -33.8, -997.9},
};
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2));
}
private class BasicLikelihoodTestProvider extends TestDataProvider {
public Double readLikelihoodForHaplotype1;
public Double readLikelihoodForHaplotype2;
public Double readLikelihoodForHaplotype3;
public BasicLikelihoodTestProvider(double a, double b) {
super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f",a,b));
readLikelihoodForHaplotype1 = a;
readLikelihoodForHaplotype2 = b;
readLikelihoodForHaplotype3 = null;
}
public BasicLikelihoodTestProvider(double a, double b, double c) {
super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f / %f",a,b,c));
readLikelihoodForHaplotype1 = a;
readLikelihoodForHaplotype2 = b;
readLikelihoodForHaplotype3 = c;
}
public double[][] expectedDiploidHaplotypeMatrix() {
if( readLikelihoodForHaplotype3 == null ) {
double maxValue = Math.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2);
double[][] normalizedMatrix = {
{readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY},
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue}
};
return normalizedMatrix;
} else {
double maxValue = MathUtils.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2,readLikelihoodForHaplotype3);
double[][] normalizedMatrix = {
{readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY},
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue, Double.NEGATIVE_INFINITY},
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue,
Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype2) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue, readLikelihoodForHaplotype3 - maxValue}
};
return normalizedMatrix;
}
}
public double[][] calcDiploidHaplotypeMatrix() {
ArrayList<Haplotype> haplotypes = new ArrayList<Haplotype>();
for( int iii = 1; iii <= 3; iii++) {
Double readLikelihood = ( iii == 1 ? readLikelihoodForHaplotype1 : ( iii == 2 ? readLikelihoodForHaplotype2 : readLikelihoodForHaplotype3) );
if( readLikelihood != null ) {
Haplotype haplotype = new Haplotype( (iii == 1 ? "AAAA" : (iii == 2 ? "CCCC" : "TTTT")).getBytes() );
haplotype.addReadLikelihoods("myTestSample", new double[]{readLikelihood});
haplotypes.add(haplotype);
}
}
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample");
}
}
@DataProvider(name = "BasicLikelihoodTestProvider")
public Object[][] makeBasicLikelihoodTests() {
new BasicLikelihoodTestProvider(-1.1, -2.2);
new BasicLikelihoodTestProvider(-2.2, -1.1);
new BasicLikelihoodTestProvider(-1.1, -1.1);
new BasicLikelihoodTestProvider(-9.7, -15.0);
new BasicLikelihoodTestProvider(-1.1, -2000.2);
new BasicLikelihoodTestProvider(-1000.1, -2.2);
new BasicLikelihoodTestProvider(0, 0);
new BasicLikelihoodTestProvider(-1.1, 0);
new BasicLikelihoodTestProvider(0, -2.2);
new BasicLikelihoodTestProvider(-100.1, -200.2);
new BasicLikelihoodTestProvider(-1.1, -2.2, 0);
new BasicLikelihoodTestProvider(-2.2, -1.1, 0);
new BasicLikelihoodTestProvider(-1.1, -1.1, 0);
new BasicLikelihoodTestProvider(-9.7, -15.0, 0);
new BasicLikelihoodTestProvider(-1.1, -2000.2, 0);
new BasicLikelihoodTestProvider(-1000.1, -2.2, 0);
new BasicLikelihoodTestProvider(0, 0, 0);
new BasicLikelihoodTestProvider(-1.1, 0, 0);
new BasicLikelihoodTestProvider(0, -2.2, 0);
new BasicLikelihoodTestProvider(-100.1, -200.2, 0);
new BasicLikelihoodTestProvider(-1.1, -2.2, -12.121);
new BasicLikelihoodTestProvider(-2.2, -1.1, -12.121);
new BasicLikelihoodTestProvider(-1.1, -1.1, -12.121);
new BasicLikelihoodTestProvider(-9.7, -15.0, -12.121);
new BasicLikelihoodTestProvider(-1.1, -2000.2, -12.121);
new BasicLikelihoodTestProvider(-1000.1, -2.2, -12.121);
new BasicLikelihoodTestProvider(0, 0, -12.121);
new BasicLikelihoodTestProvider(-1.1, 0, -12.121);
new BasicLikelihoodTestProvider(0, -2.2, -12.121);
new BasicLikelihoodTestProvider(-100.1, -200.2, -12.121);
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
}
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
public void testOneReadWithTwoOrThreeHaplotypes(BasicLikelihoodTestProvider cfg) {
double[][] calculatedMatrix = cfg.calcDiploidHaplotypeMatrix();
double[][] expectedMatrix = cfg.expectedDiploidHaplotypeMatrix();
logger.warn(String.format("Test: %s", cfg.toString()));
Assert.assertTrue(compareDoubleArrays(calculatedMatrix, expectedMatrix));
}
/**
* Private function to compare 2d arrays
*/
private boolean compareDoubleArrays(double[][] b1, double[][] b2) {
if( b1.length != b2.length ) {
return false; // sanity check
}
for( int i=0; i < b1.length; i++ ){
if( b1[i].length != b2[i].length) {
return false; // sanity check
}
for( int j=0; j < b1.length; j++ ){
if ( MathUtils.compareDoubles(b1[i][j], b2[i][j]) != 0 && !Double.isInfinite(b1[i][j]) && !Double.isInfinite(b2[i][j]))
return false;
}
}
return true;
}
}

View File

@ -0,0 +1,257 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/27/12
*/
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
private class MergeNodesWithNoVariationTestProvider extends TestDataProvider {
public byte[] sequence;
public int KMER_LENGTH;
public MergeNodesWithNoVariationTestProvider(String seq, int kmer) {
super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq));
sequence = seq.getBytes();
KMER_LENGTH = kmer;
}
public DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> expectedGraph() {
DeBruijnVertex v = new DeBruijnVertex(sequence, 0);
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
graph.addVertex(v);
return graph;
}
public DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> calcGraph() {
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH);
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH);
SimpleDeBruijnAssembler.addKmersToGraph(graph, kmer1, kmer2, false);
}
SimpleDeBruijnAssembler.mergeNodes(graph);
return graph;
}
}
@DataProvider(name = "MergeNodesWithNoVariationTestProvider")
public Object[][] makeMergeNodesWithNoVariationTests() {
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7);
new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class);
}
@Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true)
public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) {
logger.warn(String.format("Test: %s", cfg.toString()));
Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph()));
}
@Test(enabled = true)
public void testPruneGraph() {
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 0);
DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 0);
DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 0);
DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 0);
DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 0);
DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 0);
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new DeBruijnEdge(false, 1));
graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
graph.addEdge(v5, v6, new DeBruijnEdge(false, 2));
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
SimpleDeBruijnAssembler.pruneGraph(graph, 2);
Assert.assertTrue(graphEquals(graph, expectedGraph));
graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new DeBruijnEdge(true, 1));
graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
expectedGraph.addVertex(v);
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v, v2, new DeBruijnEdge(true, 1));
expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
SimpleDeBruijnAssembler.pruneGraph(graph, 2);
Assert.assertTrue(graphEquals(graph, expectedGraph));
}
@Test(enabled = true)
public void testEliminateNonRefPaths() {
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 0);
DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 0);
DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 0);
DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 0);
DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 0);
DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 0);
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new DeBruijnEdge(false));
graph.addEdge(v2, v3, new DeBruijnEdge(true));
graph.addEdge(v3, v4, new DeBruijnEdge(true));
graph.addEdge(v4, v5, new DeBruijnEdge(true));
graph.addEdge(v5, v6, new DeBruijnEdge(false));
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v2, v3, new DeBruijnEdge());
expectedGraph.addEdge(v3, v4, new DeBruijnEdge());
expectedGraph.addEdge(v4, v5, new DeBruijnEdge());
SimpleDeBruijnAssembler.eliminateNonRefPaths(graph);
Assert.assertTrue(graphEquals(graph, expectedGraph));
graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new DeBruijnEdge(true));
graph.addEdge(v2, v3, new DeBruijnEdge(true));
graph.addEdge(v4, v5, new DeBruijnEdge(false));
graph.addEdge(v5, v6, new DeBruijnEdge(false));
expectedGraph.addVertex(v);
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addEdge(v, v2, new DeBruijnEdge());
expectedGraph.addEdge(v2, v3, new DeBruijnEdge());
SimpleDeBruijnAssembler.eliminateNonRefPaths(graph);
Assert.assertTrue(graphEquals(graph, expectedGraph));
graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new DeBruijnEdge(true));
graph.addEdge(v2, v3, new DeBruijnEdge(true));
graph.addEdge(v4, v5, new DeBruijnEdge(false));
graph.addEdge(v5, v6, new DeBruijnEdge(false));
graph.addEdge(v4, v2, new DeBruijnEdge(false));
expectedGraph.addVertex(v);
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addEdge(v, v2, new DeBruijnEdge());
expectedGraph.addEdge(v2, v3, new DeBruijnEdge());
SimpleDeBruijnAssembler.eliminateNonRefPaths(graph);
Assert.assertTrue(graphEquals(graph, expectedGraph));
}
private boolean graphEquals(DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> g1, DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> g2) {
if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) {
return false;
}
for( DeBruijnEdge e1 : g1.edgeSet() ) {
boolean found = false;
for( DeBruijnEdge e2 : g2.edgeSet() ) {
if( e1.equals(g1, e2, g2) ) { found = true; break; }
}
if( !found ) { return false; }
}
for( DeBruijnEdge e2 : g2.edgeSet() ) {
boolean found = false;
for( DeBruijnEdge e1 : g1.edgeSet() ) {
if( e2.equals(g2, e1, g1) ) { found = true; break; }
}
if( !found ) { return false; }
}
return true;
}
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
@ -15,7 +16,7 @@ import java.util.*;
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
* Note that the base quality rank sum test can not be calculated for homozygous sites.
*/
public class BaseQualityRankSumTest extends RankSumTest {
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); }

View File

@ -18,7 +18,7 @@ import java.util.*;
* Date: 6/28/12
*/
public class ClippingRankSumTest /*extends RankSumTest*/ {
public class ClippingRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("ClippingRankSum"); }

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
@ -16,7 +17,7 @@ import java.util.*;
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
* Note that the mapping quality rank sum test can not be calculated for homozygous sites.
*/
public class MappingQualityRankSumTest extends RankSumTest {
public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation {
public List<String> getKeyNames() { return Arrays.asList("MQRankSum"); }

View File

@ -28,7 +28,7 @@ import java.util.Map;
/**
* Abstract root for all RankSum based annotations
*/
public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final double INDEL_LIKELIHOOD_THRESH = 0.1;
static final boolean DEBUG = false;

View File

@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
@ -21,7 +22,7 @@ import java.util.*;
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
* Note that the read position rank sum test can not be calculated for homozygous sites.
*/
public class ReadPosRankSumTest extends RankSumTest {
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {
public List<String> getKeyNames() {
return Arrays.asList("ReadPosRankSum");