diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java new file mode 100755 index 000000000..0890ac20c --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.jgrapht.graph.DefaultDirectedGraph; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 23, 2011 + */ + +// simple edge class for connecting nodes in the graph +public class DeBruijnEdge implements Comparable { + + private int multiplicity; + private boolean isRef; + + public DeBruijnEdge() { + multiplicity = 1; + isRef = false; + } + + public DeBruijnEdge( final boolean isRef ) { + multiplicity = 1; + this.isRef = isRef; + } + + public DeBruijnEdge( final boolean isRef, final int multiplicity ) { + this.multiplicity = multiplicity; + this.isRef = isRef; + } + + public int getMultiplicity() { + return multiplicity; + } + + public void setMultiplicity( final int value ) { + multiplicity = value; + } + + public boolean getIsRef() { + return isRef; + } + + public void setIsRef( final boolean isRef ) { + this.isRef = isRef; + } + + public boolean equals( final DefaultDirectedGraph graph, final DeBruijnEdge edge ) { + return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge))); + } + + public boolean equals( final DefaultDirectedGraph graph, final DeBruijnEdge edge, final DefaultDirectedGraph graph2 ) { + return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); + } + + @Override + public int compareTo( final DeBruijnEdge that ) { + return this.multiplicity - that.multiplicity; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java new file mode 100755 index 000000000..39833613d --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 23, 2011 + */ +// simple node class for storing kmer sequences +public class DeBruijnVertex { + + protected final byte[] sequence; + public final int kmer; + + public DeBruijnVertex( final byte[] sequence, final int kmer ) { + this.sequence = sequence; + this.kmer = kmer; + } + + @Override + public boolean equals( Object v ) { + return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence); + } + + @Override + public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect + return Arrays.hashCode(sequence); + } + + public String toString() { + return new String(sequence); + } + + public String getSuffixString() { + return new String( getSuffix() ); + } + + public byte[] getSequence() { + return sequence; + } + + public byte[] getSuffix() { + return Arrays.copyOfRange( sequence, kmer - 1, sequence.length ); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java new file mode 100644 index 000000000..47a7030a8 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -0,0 +1,610 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +public class GenotypingEngine { + + private final boolean DEBUG; + private final int MNP_LOOK_AHEAD; + private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; + private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied + private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); + + public GenotypingEngine( final boolean DEBUG, final int MNP_LOOK_AHEAD, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + this.DEBUG = DEBUG; + this.MNP_LOOK_AHEAD = MNP_LOOK_AHEAD; + this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; + noCall.add(Allele.NO_CALL); + } + + // This function is the streamlined approach, currently not being used + @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) + public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, final byte[] ref, final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) { + // Prepare the list of haplotype indices to genotype + final ArrayList allelesToGenotype = new ArrayList(); + + for( final Haplotype h : haplotypes ) { + allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) ); + } + final int numHaplotypes = haplotypes.size(); + + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample + final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); + for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples + final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample); + int glIndex = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC + } + } + genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make()); + } + final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); + if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here + + // Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF + final ArrayList haplotypesToRemove = new ArrayList(); + for( final Haplotype h : haplotypes ) { + if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF + haplotypesToRemove.add(h); + } + } + haplotypes.removeAll(haplotypesToRemove); + + if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + final List>>> returnVCs = new ArrayList>>>(); + // set up the default 1-to-1 haplotype mapping object + final HashMap> haplotypeMapping = new HashMap>(); + for( final Haplotype h : haplotypes ) { + final ArrayList list = new ArrayList(); + list.add(h); + haplotypeMapping.put(call.getAllele(h.getBases()), list); + } + returnVCs.add( new Pair>>(call,haplotypeMapping) ); + return returnVCs; + } + + final ArrayList>>> returnCalls = new ArrayList>>>(); + + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = new TreeSet(); + int count = 0; + if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } + for( final Haplotype h : haplotypes ) { + if( DEBUG ) { + System.out.println( h.toString() ); + System.out.println( "> Cigar = " + h.getCigar() ); + } + // Walk along the alignment and turn any difference from the reference into an event + h.setEventMap( generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) ); + startPosKeySet.addAll(h.getEventMap().keySet()); + } + + // Create the VC merge priority list + final ArrayList priorityList = new ArrayList(); + for( int iii = 0; iii < haplotypes.size(); iii++ ) { + priorityList.add("HC" + iii); + } + + // Walk along each position in the key set and create each event to be outputted + for( final int loc : startPosKeySet ) { + if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { + final ArrayList eventsAtThisLoc = new ArrayList(); + for( final Haplotype h : haplotypes ) { + final HashMap eventMap = h.getEventMap(); + final VariantContext vc = eventMap.get(loc); + if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { + eventsAtThisLoc.add(vc); + } + } + + // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event + final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + + // Merge the event to find a common reference representation + final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + + final HashMap> alleleHashMap = new HashMap>(); + int aCount = 0; + for( final Allele a : mergedVC.getAlleles() ) { + alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper + } + + if( DEBUG ) { + System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); + //System.out.println("Event/haplotype allele mapping = " + alleleMapper); + } + + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample + final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); + for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples + final int myNumHaplotypes = alleleMapper.size(); + final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2]; + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); + int glIndex = 0; + for( int iii = 0; iii < myNumHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC + } + } + + // using the allele mapping object translate the haplotype allele into the event allele + final Genotype g = new GenotypeBuilder(sample) + .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes)) + .phased(loc != startPosKeySet.first()) + .PL(genotypeLikelihoods).make(); + myGenotypes.add(g); + } + returnCalls.add( new Pair>>( + new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) ); + } + } + return returnCalls; + } + + @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) + public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, final byte[] ref, final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser, final ArrayList activeAllelesToGenotype ) { + + final ArrayList>>> returnCalls = new ArrayList>>>(); + + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = new TreeSet(); + int count = 0; + if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } + for( final Haplotype h : haplotypes ) { + // Walk along the alignment and turn any difference from the reference into an event + h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) ); + if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); } + if( DEBUG ) { + System.out.println( h.toString() ); + System.out.println( "> Cigar = " + h.getCigar() ); + System.out.println( "> Left and right breaks = (" + h.leftBreakPoint + " , " + h.rightBreakPoint + ")"); + System.out.println( ">> Events = " + h.getEventMap()); + } + } + // Create the VC merge priority list + final ArrayList priorityList = new ArrayList(); + for( int iii = 0; iii < haplotypes.size(); iii++ ) { + priorityList.add("HC" + iii); + } + + cleanUpSymbolicUnassembledEvents( haplotypes, priorityList ); + if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure + mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); + } + if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! + for( final VariantContext compVC : activeAllelesToGenotype ) { + startPosKeySet.add( compVC.getStart() ); + } + } + + + // Walk along each position in the key set and create each event to be outputted + for( final int loc : startPosKeySet ) { + if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { + final ArrayList eventsAtThisLoc = new ArrayList(); + if( activeAllelesToGenotype.isEmpty() ) { + for( final Haplotype h : haplotypes ) { + final HashMap eventMap = h.getEventMap(); + final VariantContext vc = eventMap.get(loc); + if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { + eventsAtThisLoc.add(vc); + } + } + } else { // we are in GGA mode! + for( final VariantContext compVC : activeAllelesToGenotype ) { + if( compVC.getStart() == loc ) { + priorityList.clear(); + int alleleCount = 0; + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + HashSet alleleSet = new HashSet(2); + alleleSet.add(compVC.getReference()); + alleleSet.add(compAltAllele); + priorityList.add("Allele" + alleleCount); + eventsAtThisLoc.add(new VariantContextBuilder(compVC).alleles(alleleSet).source("Allele"+alleleCount).make()); + alleleCount++; + } + } + } + } + + if( eventsAtThisLoc.isEmpty() ) { continue; } + + // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event + final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + + // Merge the event to find a common reference representation + final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + if( mergedVC == null ) { continue; } + + final HashMap> alleleHashMap = new HashMap>(); + int aCount = 0; + for( final Allele a : mergedVC.getAlleles() ) { + alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper + } + + if( DEBUG ) { + System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); + //System.out.println("Event/haplotype allele mapping = " + alleleMapper); + } + + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample + final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); + for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples + final int numHaplotypes = alleleMapper.size(); + final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); + int glIndex = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC + } + } + genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); + } + final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); + + if( call != null ) { + returnCalls.add( new Pair>>(call, alleleHashMap) ); + } + } + } + return returnCalls; + } + + protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes, final ArrayList priorityList ) { + final ArrayList haplotypesToRemove = new ArrayList(); + final ArrayList stringsToRemove = new ArrayList(); + for( final Haplotype h : haplotypes ) { + for( final VariantContext vc : h.getEventMap().values() ) { + if( vc.isSymbolic() ) { + for( final Haplotype h2 : haplotypes ) { + for( final VariantContext vc2 : h2.getEventMap().values() ) { + if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) { + haplotypesToRemove.add(h); + stringsToRemove.add(vc.getSource()); + break; + } + } + } + } + } + } + haplotypes.removeAll(haplotypesToRemove); + priorityList.removeAll(stringsToRemove); + } + + protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + final int MAX_SIZE_TO_COMBINE = 10; + final double MERGE_EVENTS_R2_THRESHOLD = 0.95; + if( startPosKeySet.size() <= 1 ) { return; } + + boolean mapWasUpdated = true; + while( mapWasUpdated ) { + mapWasUpdated = false; + + // loop over the set of start locations and consider pairs that start near each other + final Iterator iter = startPosKeySet.iterator(); + int thisStart = iter.next(); + while( iter.hasNext() ) { + final int nextStart = iter.next(); + if( nextStart - thisStart < MAX_SIZE_TO_COMBINE) { + boolean isBiallelic = true; + VariantContext thisVC = null; + VariantContext nextVC = null; + int x11 = 0; + int x12 = 0; + int x21 = 0; + int x22 = 0; + + for( final Haplotype h : haplotypes ) { + // only make complex substitutions out of consecutive biallelic sites + final VariantContext thisHapVC = h.getEventMap().get(thisStart); + if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype + if( thisVC == null ) { + thisVC = thisHapVC; + } else if( !thisHapVC.hasSameAllelesAs( thisVC ) ) { + isBiallelic = false; + break; + } + } + final VariantContext nextHapVC = h.getEventMap().get(nextStart); + if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype + if( nextVC == null ) { + nextVC = nextHapVC; + } else if( !nextHapVC.hasSameAllelesAs( nextVC ) ) { + isBiallelic = false; + break; + } + } + // count up the co-occurrences of the events for the R^2 calculation + // BUGBUG: use haplotype likelihoods per-sample to make this more accurate + if( thisHapVC == null ) { + if( nextHapVC == null ) { x11++; } + else { x12++; } + } else { + if( nextHapVC == null ) { x21++; } + else { x22++; } + } + } + if( thisVC == null || nextVC == null ) { + continue; + //throw new ReviewedStingException("StartPos TreeSet has an entry for an event that is found on no haplotype. start pos = " + thisStart + ", next pos = " + nextStart); + } + if( isBiallelic ) { + final double R2 = calculateR2LD( x11, x12, x21, x22 ); + if( DEBUG ) { + System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); + System.out.println("-- " + thisVC); + System.out.println("-- " + nextVC); + } + if( R2 > MERGE_EVENTS_R2_THRESHOLD ) { + + byte[] refBases = thisVC.getReference().getBases(); + byte[] altBases = thisVC.getAlternateAllele(0).getBases(); + final int thisPadding = ( thisVC.getReference().isNull() || thisVC.getAlternateAllele(0).isNull() ? 1 : 0 ); + final int nextPadding = ( nextVC.getReference().isNull() || nextVC.getAlternateAllele(0).isNull() ? 1 : 0 ); + for( int locus = thisStart + refBases.length + thisPadding ; locus < nextStart + nextPadding; locus++ ) { + final byte refByte = ref[ locus - refLoc.getStart() ]; + refBases = ArrayUtils.add(refBases, refByte); + altBases = ArrayUtils.add(altBases, refByte); + } + refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases()); + altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); + + final ArrayList mergedAlleles = new ArrayList(); + mergedAlleles.add( Allele.create( refBases, true ) ); + mergedAlleles.add( Allele.create( altBases, false ) ); + final VariantContext mergedVC = ( refBases.length == altBases.length ? + new VariantContextBuilder("MNP", thisVC.getChr(), thisVC.getStart() + (thisVC.isIndel() && !thisVC.isComplexIndel() ? 1 : 0) , nextVC.getEnd(), mergedAlleles).make() : + new VariantContextBuilder("Complex", thisVC.getChr(), thisVC.getStart(), nextVC.getEnd(), mergedAlleles).referenceBaseForIndel(ref[thisStart-refLoc.getStart()]).make() ); + + // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event + for( final Haplotype h : haplotypes ) { + final HashMap eventMap = h.getEventMap(); + if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) { + eventMap.remove(thisStart); + eventMap.remove(nextStart); + eventMap.put(mergedVC.getStart(), mergedVC); + } + } + startPosKeySet.add(mergedVC.getStart()); + boolean containsStart = false; + boolean containsNext = false; + for( final Haplotype h : haplotypes ) { + final HashMap eventMap = h.getEventMap(); + if( eventMap.containsKey(thisStart) ) { containsStart = true; } + if( eventMap.containsKey(nextStart) ) { containsNext = true; } + } + if(!containsStart) { startPosKeySet.remove(thisStart); } + if(!containsNext) { startPosKeySet.remove(nextStart); } + + if( DEBUG ) { System.out.println("====> " + mergedVC); } + mapWasUpdated = true; + break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events + } + } + } + thisStart = nextStart; + } + } + } + + @Requires({"x11 >= 0", "x12 >= 0", "x21 >= 0", "x22 >= 0"}) + @Ensures({"result >= 0.0", "result <= 1.0"}) + protected static double calculateR2LD( final int x11, final int x12, final int x21, final int x22 ) { + final int total = x11 + x12 + x21 + x22; + final double pa1b1 = ((double) x11) / ((double) total); + final double pa1b2 = ((double) x12) / ((double) total); + final double pa2b1 = ((double) x21) / ((double) total); + final double pa1 = pa1b1 + pa1b2; + final double pb1 = pa1b1 + pa2b1; + return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) ); + } + + @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) + @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) + protected static ArrayList> createAlleleMapper( final int loc, final ArrayList eventsAtThisLoc, final ArrayList haplotypes ) { + final ArrayList> alleleMapper = new ArrayList>(); + final ArrayList refList = new ArrayList(); + for( final Haplotype h : haplotypes ) { + if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype + refList.add(h); + } else { + boolean foundInEventList = false; + for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { + if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { + foundInEventList = true; + } + } + if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype + refList.add(h); + } + } + } + alleleMapper.add(refList); + for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { + final ArrayList list = new ArrayList(); + for( final Haplotype h : haplotypes ) { + if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { + list.add(h); + } + } + alleleMapper.add(list); + } + return alleleMapper; + } + + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) + protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { + if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } + final ArrayList eventAllelesForSample = new ArrayList(); + for( final Allele a : haplotypeAllelesForSample ) { + final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a)); + for( int iii = 0; iii < alleleMapper.size(); iii++ ) { + final ArrayList mappedHaplotypes = alleleMapper.get(iii); + if( mappedHaplotypes.contains(haplotype) ) { + eventAllelesForSample.add(eventAlleles.get(iii)); + break; + } + } + } + return eventAllelesForSample; + } + + protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { + for( final VariantContext vc : list ) { + if( vc.hasSameAllelesAs(vcToTest) ) { + return true; + } + } + return false; + } + + protected static HashMap generateVCsFromAlignment( final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) { + return generateVCsFromAlignment(null, alignmentStartHapwrtRef, cigar, ref, alignment, refLoc, sourceNameToAdd, MNP_LOOK_AHEAD); // BUGBUG: needed for compatibility with HaplotypeResolver code + } + + protected static HashMap generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) { + final HashMap vcs = new HashMap(); + + int refPos = alignmentStartHapwrtRef; + if( refPos < 0 ) { return null; } // Protection against SW failures + int alignmentPos = 0; + + for( final CigarElement ce : cigar.getCigarElements() ) { + final int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case I: + final byte[] insertionBases = Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength ); + boolean allN = true; + for( final byte b : insertionBases ) { + if( b != (byte) 'N' ) { + allN = false; + break; + } + } + if( !allN ) { + final ArrayList insertionAlleles = new ArrayList(); + final int insertionStart = refLoc.getStart() + refPos - 1; + if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) { + insertionAlleles.add( Allele.create(ref[refPos-1], true) ); + insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); + vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); + } else { + insertionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, true) ); + insertionAlleles.add( Allele.create(insertionBases, false) ); + vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).referenceBaseForIndel(ref[refPos-1]).make()); + } + + } + alignmentPos += elementLength; + break; + case S: + alignmentPos += elementLength; + break; + case D: + final byte[] deletionBases = Arrays.copyOfRange( ref, refPos, refPos + elementLength ); + final ArrayList deletionAlleles = new ArrayList(); + final int deletionStart = refLoc.getStart() + refPos - 1; + // BUGBUG: how often does this symbolic deletion allele case happen? + //if( haplotype != null && ( (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) + // || (haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) ) ) { + // deletionAlleles.add( Allele.create(ref[refPos-1], true) ); + // deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); + // vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make()); + //} else { + deletionAlleles.add( Allele.create(deletionBases, true) ); + deletionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, false) ); + vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).referenceBaseForIndel(ref[refPos-1]).make()); + //} + refPos += elementLength; + break; + case M: + int numSinceMismatch = -1; + int stopOfMismatch = -1; + int startOfMismatch = -1; + int refPosStartOfMismatch = -1; + for( int iii = 0; iii < elementLength; iii++ ) { + if( ref[refPos] != alignment[alignmentPos] && alignment[alignmentPos] != ((byte) 'N') ) { + // SNP or start of possible MNP + if( stopOfMismatch == -1 ) { + startOfMismatch = alignmentPos; + stopOfMismatch = alignmentPos; + numSinceMismatch = 0; + refPosStartOfMismatch = refPos; + } else { + stopOfMismatch = alignmentPos; + } + } + if( stopOfMismatch != -1) { + numSinceMismatch++; + } + if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) { + final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 ); + final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 ); + final ArrayList snpAlleles = new ArrayList(); + snpAlleles.add( Allele.create( refBases, true ) ); + snpAlleles.add( Allele.create( mismatchBases, false ) ); + final int snpStart = refLoc.getStart() + refPosStartOfMismatch; + vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make()); + numSinceMismatch = -1; + stopOfMismatch = -1; + startOfMismatch = -1; + refPosStartOfMismatch = -1; + } + refPos++; + alignmentPos++; + } + break; + case N: + case H: + case P: + default: + throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); + } + } + return vcs; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java new file mode 100755 index 000000000..bda94cc8f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -0,0 +1,558 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM. + * + *

Input

+ *

+ * Input bam file(s) from which to make calls + *

+ * + *

Output

+ *

+ * VCF file with raw, unrecalibrated SNP and indel calls. + *

+ * + *

Examples

+ *
+ *   java
+ *     -jar GenomeAnalysisTK.jar
+ *     -T HaplotypeCaller
+ *     -R reference/human_g1k_v37.fasta
+ *     -I input.bam
+ *     -o output.raw.snps.indels.vcf
+ * 
+ * + * @author rpoplin + * @since 8/22/11 + */ + +@PartitionBy(PartitionType.LOCUS) +@ActiveRegionExtension(extension=65, maxRegion=275) +public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatibleWalker { + + /** + * A raw, unfiltered, highly specific callset in VCF format. + */ + @Output(doc="File to which variants should be written", required = true) + protected VariantContextWriter vcfWriter = null; + + @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) + protected PrintStream graphWriter = null; + + @Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false) + protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN; + + @Argument(fullName="keepRG", shortName="keepRG", doc="keepRG", required = false) + protected String keepRG = null; + + @Argument(fullName="mnpLookAhead", shortName="mnpLookAhead", doc = "The number of bases to combine together to form MNPs out of nearby consecutive SNPs on the same haplotype", required = false) + protected int MNP_LOOK_AHEAD = 0; + + @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = true) + protected int MIN_PRUNE_FACTOR = 0; + + @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false) + protected boolean GENOTYPE_FULL_ACTIVE_REGION = false; + + @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false) + protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; + + @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="gcpHMM", required = false) + protected int gcpHMM = 10; + + @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage per sample to downsample each region to", required = false) + protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; + + @Argument(fullName="useExpandedTriggerSet", shortName="expandedTriggers", doc = "If specified, use additional, experimental triggers designed to capture larger indels but which may lead to an increase in the false positive rate", required=false) + protected boolean USE_EXPANDED_TRIGGER_SET = false; + + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) + protected boolean USE_ALLELES_TRIGGER = false; + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper + public RodBinding getSnpEffRodBinding() { return null; } + public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } + + /** + * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. + */ + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) + protected List annotationsToUse = new ArrayList(Arrays.asList(new String[]{"ClippingRankSumTest"})); + + /** + * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, + * so annotations will be excluded even if they are explicitly included with the other options. + */ + @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) + protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"})); + + /** + * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. + */ + @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) + protected String[] annotationClassesToUse = { "Standard" }; + + @ArgumentCollection + private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + + // the calculation arguments + private UnifiedGenotyperEngine UG_engine = null; + private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; + + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) + protected boolean DEBUG; + + @Argument(fullName="doBanded", shortName="doBanded", doc="If specified, use the banded option", required = false) + protected boolean doBanded = false; + + // the assembly engine + LocalAssemblyEngine assemblyEngine = null; + + // the likelihoods engine + LikelihoodCalculationEngine likelihoodCalculationEngine = null; + + // the genotyping engine + GenotypingEngine genotypingEngine = null; + + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + + // fasta reference reader to supplement the edges of the reference sequence + private IndexedFastaSequenceFile referenceReader; + + // reference base padding size + private static final int REFERENCE_PADDING = 900; + + // bases with quality less than or equal to this value are trimmed off the tails of the reads + private static final byte MIN_TAIL_QUALITY = 20; + + private ArrayList samplesList = new ArrayList(); + private final static double LOG_ONE_HALF = -Math.log10(2.0); + private final static double LOG_ONE_THIRD = -Math.log10(3.0); + private final ArrayList allelesToGenotype = new ArrayList(); + + private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file + private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + super.initialize(); + + // get all of the unique sample names + Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samplesList.addAll( samples ); + // initialize the UnifiedGenotyper Engine which is used to call into the exact model + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling + UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling + UAC.STANDARD_CONFIDENCE_FOR_CALLING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING) ); // low values used for isActive determination only, default/user-specified values used for actual calling + UAC.STANDARD_CONFIDENCE_FOR_EMITTING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING) ); // low values used for isActive determination only, default/user-specified values used for actual calling + UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + + // initialize the output VCF header + annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + + Set headerInfo = new HashSet(); + + // all annotation fields from VariantAnnotatorEngine + headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); + // all callers need to add these standard annotation header lines + VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, + VCFConstants.DOWNSAMPLED_KEY, + VCFConstants.MLE_ALLELE_COUNT_KEY, + VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + // all callers need to add these standard FORMAT field header lines + VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true, + VCFConstants.GENOTYPE_KEY, + VCFConstants.GENOTYPE_QUALITY_KEY, + VCFConstants.DEPTH_KEY, + VCFConstants.GENOTYPE_PL_KEY); + // header lines for the experimental HaplotypeCaller-specific annotations + headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant")); + headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region")); + headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region")); + headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region")); + headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles")); + headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL")); + headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX")); + headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant")); + + vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); + + try { + // fasta reference reader to supplement the edges of the reference sequence + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); + } + + assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, doBanded ); + genotypingEngine = new GenotypingEngine( DEBUG, MNP_LOOK_AHEAD, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // isActive + // + //--------------------------------------------------------------------------------------------------------------- + + // enable deletions in the pileup + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + // enable non primary reads in the active region + @Override + public boolean wantsNonPrimaryReads() { return true; } + + @Override + @Ensures({"result >= 0.0", "result <= 1.0"}) + public double isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { + + if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { + if( !allelesToGenotype.contains(vc) ) { + allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object + } + } + if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { + return 1.0; + } + } + + if( USE_ALLELES_TRIGGER ) { + return ( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + } + + if( context == null ) { return 0.0; } + + final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied + noCall.add(Allele.NO_CALL); + + final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); + final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size()); + for( final String sample : splitContexts.keySet() ) { + final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event) + Arrays.fill(genotypeLikelihoods, 0.0); + + for( final PileupElement p : splitContexts.get(sample).getBasePileup() ) { + final byte qual = ( USE_EXPANDED_TRIGGER_SET ? + ( p.isNextToSoftClip() || p.isBeforeInsertion() || p.isAfterInsertion() ? ( p.getQual() > QualityUtils.MIN_USABLE_Q_SCORE ? p.getQual() : (byte) 20 ) : p.getQual() ) + : p.getQual() ); + if( p.isDeletion() || qual > (USE_EXPANDED_TRIGGER_SET ? QualityUtils.MIN_USABLE_Q_SCORE : (byte) 18) ) { + int AA = 0; final int AB = 1; int BB = 2; + if( USE_EXPANDED_TRIGGER_SET ) { + if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() || + (!p.getRead().getNGSPlatform().equals(NGSPlatform.SOLID) && ((p.getRead().getReadPairedFlag() && p.getRead().getMateUnmappedFlag()) || BadMateFilter.hasBadMate(p.getRead()))) ) { + AA = 2; + BB = 0; + } + } else { + if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { + AA = 2; + BB = 0; + } + } + genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual); + genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF ); + genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD; + } + } + genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); + } + + final ArrayList alleles = new ArrayList(); + alleles.add( FAKE_REF_ALLELE ); + alleles.add( FAKE_ALT_ALLELE ); + final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); + return ( vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ) ); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + + final ArrayList activeAllelesToGenotype = new ArrayList(); + + if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + for( final VariantContext vc : allelesToGenotype ) { + if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { + activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode + } + } + allelesToGenotype.removeAll( activeAllelesToGenotype ); + } + + if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do! + if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! + if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! + + finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region + referenceHaplotype.setIsReference(true); + final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); + //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); + final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! + + activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region + final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria + if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + + // evaluate each sample's reads against all haplotypes + final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); + final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); + likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList ); + + // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) + final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); + + for( final Pair>> callResult : + ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() ) + : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { + if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } + + final Map>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); + final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); + + // add some custom annotations to the calls + final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); + // Calculate the number of variants on the haplotype + int maxNumVar = 0; + for( final Allele allele : callResult.getFirst().getAlleles() ) { + if( !allele.isReference() ) { + for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { + final int numVar = haplotype.getEventMap().size(); + if( numVar > maxNumVar ) { maxNumVar = numVar; } + } + } + } + // Calculate the event length + int maxLength = 0; + for ( final Allele a : annotatedCall.getAlternateAlleles() ) { + final int length = a.length() - annotatedCall.getReference().length(); + if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } + } + + myAttributes.put("NVH", maxNumVar); + myAttributes.put("NumHapEval", bestHaplotypes.size()); + myAttributes.put("NumHapAssembly", haplotypes.size()); + myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); + myAttributes.put("EVENTLENGTH", maxLength); + myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); + myAttributes.put("extType", annotatedCall.getType().toString() ); + + //if( likelihoodCalculationEngine.haplotypeScore != null ) { + // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); + //} + if( annotatedCall.hasAttribute("QD") ) { + myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); + } + + vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); + } + + if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } + + return 1; // One active region was processed during this map call + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer cur, Integer sum) { + return cur + sum; + } + + @Override + public void onTraversalDone(Integer result) { + logger.info("Ran local assembly on " + result + " active regions"); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // private helper functions + // + //--------------------------------------------------------------------------------------------------------------- + + private void finalizeActiveRegion( final ActiveRegion activeRegion ) { + if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getExtendedLoc() + " with " + activeRegion.size() + " reads:"); } + final ArrayList finalizedReadList = new ArrayList(); + final FragmentCollection fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) ); + activeRegion.clearReads(); + + // Join overlapping paired reads to create a single longer read + finalizedReadList.addAll( fragmentCollection.getSingletonReads() ); + for( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) { + finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) ); + } + + Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator()); + + // Loop through the reads hard clipping the adaptor and low quality tails + for( final GATKSAMRecord myRead : finalizedReadList ) { + final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); + if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + // protect against INTERVALS with abnormally high coverage + if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { + activeRegion.add(clippedRead); + } + } + } + } + + private List filterNonPassingReads( final ActiveRegion activeRegion ) { + final ArrayList readsToRemove = new ArrayList(); + for( final GATKSAMRecord rec : activeRegion.getReads() ) { + if( rec.getReadLength() < 24 || rec.getMappingQuality() <= 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { + readsToRemove.add(rec); + } + } + activeRegion.removeAll( readsToRemove ); + return readsToRemove; + } + + private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) { + final int padLeft = Math.max(activeRegion.getReferenceLoc().getStart()-REFERENCE_PADDING, 1); + final int padRight = Math.min(activeRegion.getReferenceLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReferenceLoc().getContig()).getSequenceLength()); + return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReferenceLoc().getContig(), padLeft, padRight); + } + + private HashMap> splitReadsBySample( final List reads ) { + final HashMap> returnMap = new HashMap>(); + for( final String sample : samplesList) { + ArrayList readList = returnMap.get( sample ); + if( readList == null ) { + readList = new ArrayList(); + returnMap.put(sample, readList); + } + } + for( final GATKSAMRecord read : reads ) { + returnMap.get(read.getReadGroup().getSample()).add(read); + } + + return returnMap; + } + + /* + private int determinePruneFactorFromCoverage( final ActiveRegion activeRegion ) { + final ArrayList readLengthDistribution = new ArrayList(); + for( final GATKSAMRecord read : activeRegion.getReads() ) { + readLengthDistribution.add(read.getReadLength()); + } + final double meanReadLength = MathUtils.average(readLengthDistribution); + final double meanCoveragePerSample = (double) activeRegion.getReads().size() / ((double) activeRegion.getExtendedLoc().size() / meanReadLength) / (double) samplesList.size(); + int PRUNE_FACTOR = 0; + if( meanCoveragePerSample > 8.5 ) { + PRUNE_FACTOR = (int) Math.floor( Math.sqrt( meanCoveragePerSample - 5.0 ) ); + } else if( meanCoveragePerSample > 3.0 ) { + PRUNE_FACTOR = 1; + } + + if( DEBUG ) { System.out.println(String.format("Mean coverage per sample = %.1f --> prune factor = %d", meanCoveragePerSample, PRUNE_FACTOR)); } + return PRUNE_FACTOR; + } + */ +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java new file mode 100755 index 000000000..8d1ffa0a0 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; + +import java.util.*; + +/** + * Haplotype-based resolution of variants in 2 different eval files. + * + *

+ * HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them. + * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. + * Records are annotated with the set and status attributes. + * + *

Input

+ *

+ * 2 variant files to resolve. + *

+ * + *

Output

+ *

+ * A single consensus VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx1g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T HaplotypeResolver \
+ *   -V:v1 input1.vcf \
+ *   -V:v2 input2.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW)) +public class HaplotypeResolver extends RodWalker { + + protected static final String INTERSECTION_SET = "intersection"; + protected static final String SAME_STATUS = "same"; + protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch"; + protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles"; + protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype"; + protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther"; + protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents"; + + protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50; + protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000; + protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100; + protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER; + + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variants; + + @Output(doc="File to which variants should be written", required=true) + protected VariantContextWriter baseWriter = null; + private VariantContextWriter writer; + + /** + * Set to 'null' if you don't want the set field emitted. + */ + @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) + protected String SET_KEY = "set"; + + /** + * Set to 'null' if you don't want the status field emitted. + */ + @Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false) + protected String STATUS_KEY = "status"; + + private final LinkedList queue = new LinkedList(); + private String source1, source2; + private final List sourceVCs1 = new ArrayList(); + private final List sourceVCs2 = new ArrayList(); + + + private class VCcontext { + public final Collection vcs; + public final GenomeLoc loc; + public final ReferenceContext ref; + + public VCcontext(final Collection vcs, final ReferenceContext ref) { + this.vcs = vcs; + this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next()); + this.ref = ref; + } + } + + public void initialize() { + + if ( variants.size() != 2 ) { + throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files"); + } + source1 = variants.get(0).getName(); + source2 = variants.get(1).getName(); + + if ( SET_KEY.toLowerCase().equals("null") ) + SET_KEY = null; + if ( STATUS_KEY.toLowerCase().equals("null") ) + STATUS_KEY = null; + + // for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header) + Set headerLines = new HashSet(); + if ( SET_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record")); + if ( STATUS_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match")); + final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.emptySet()); + baseWriter.writeHeader(vcfHeader); + writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + final Collection VCs = tracker.getValues(variants, context.getLocation()); + if ( VCs.size() == 0 ) + return 0; + + final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref); + + // TODO -- what should we do about filtered records? + + if ( !queue.isEmpty() ) { + + final VCcontext previous = queue.getLast(); + if ( !previous.loc.onSameContig(vc.loc) || + previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS || + queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) { + purgeQueue(); + } + } + + queue.addLast(vc); + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + if ( !queue.isEmpty() ) + purgeQueue(); + writer.close(); + } + + private void purgeQueue() { + + final ReferenceContext refContext = queue.getFirst().ref; + + // divide them up by source + while ( !queue.isEmpty() ) { + VCcontext context = queue.removeFirst(); + for ( final VariantContext vc: context.vcs ) { + if ( vc.getSource().equals(source1) ) + sourceVCs1.add(vc); + else + sourceVCs2.add(vc); + } + } + + writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS); + + if ( sourceVCs1.isEmpty() ) { + writeAll(sourceVCs2, source2, null); + } else if ( sourceVCs2.isEmpty() ) { + writeAll(sourceVCs1, source1, null); + } else { + resolveByHaplotype(refContext); + } + + // allow for GC of the data + sourceVCs1.clear(); + sourceVCs2.clear(); + } + + private void writeAll(final List sourceVCs, final String set, final String status) { + for ( final VariantContext vc : sourceVCs ) { + writeOne(vc, set, status); + } + } + + private void writeOne(final VariantContext vc, final String set, final String status) { + final Map attrs = new HashMap(vc.getAttributes()); + if ( SET_KEY != null && set != null ) + attrs.put(SET_KEY, set); + if ( STATUS_KEY != null && status != null ) + attrs.put(STATUS_KEY, status); + writer.add(new VariantContextBuilder(vc).attributes(attrs).make()); + } + + private void writeAndPurgeAllEqualVariants(final List sourceVCs1, final List sourceVCs2, final String status) { + + int currentIndex1 = 0, currentIndex2 = 0; + int size1 = sourceVCs1.size(), size2 = sourceVCs2.size(); + VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); + + while ( current1 != null && current2 != null ) { + + final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); + final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); + + if ( loc1.equals(loc2) || + (loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) { + // test the alleles + if ( determineAndWriteOverlap(current1, current2, status) ) { + sourceVCs1.remove(currentIndex1); + sourceVCs2.remove(currentIndex2); + size1--; + size2--; + } else { + currentIndex1++; + currentIndex2++; + } + current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); + } else if ( loc1.isBefore(loc2) ) { + currentIndex1++; + current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + } else { + currentIndex2++; + current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); + } + } + } + + private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) { + final int allelesFrom1In2 = findOverlap(vc1, vc2); + final int allelesFrom2In1 = findOverlap(vc2, vc1); + final int totalAllelesIn1 = vc1.getAlternateAlleles().size(); + final int totalAllelesIn2 = vc2.getAlternateAlleles().size(); + + final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1; + final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2; + + boolean thereIsOverlap = true; + + if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) { + writeOne(vc1, INTERSECTION_SET, status); + } else if ( allAllelesFrom1Overlap ) { + writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2); + } else if ( allAllelesFrom2Overlap ) { + writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1); + } else if ( allelesFrom1In2 > 0 ) { + writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS); + } else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction + writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS); + } else { + thereIsOverlap = false; + } + + return thereIsOverlap; + } + + private static int findOverlap(final VariantContext target, final VariantContext comparison) { + int overlap = 0; + for ( final Allele allele : target.getAlternateAlleles() ) { + if ( comparison.hasAlternateAllele(allele) ) + overlap++; + } + return overlap; + } + + private static final double SW_MATCH = 4.0; + private static final double SW_MISMATCH = -10.0; + private static final double SW_GAP = -25.0; + private static final double SW_GAP_EXTEND = -1.3; + private void resolveByHaplotype(final ReferenceContext refContext) { + + final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext); + final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext); + + final SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + + // protect against SW failures + if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 || + swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) { + // TODO -- handle errors appropriately + logger.debug("Bad SW alignment; aborting at " + refContext.getLocus()); + return; + } + + // order results by start position + final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1, 0)); + final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2, 0)); + if ( source1Map.size() == 0 || source2Map.size() == 0 ) { + // TODO -- handle errors appropriately + logger.debug("No source alleles; aborting at " + refContext.getLocus()); + return; + } + + // create lists and test for equality + final List source1Alleles = new ArrayList(source1Map.values()); + final List source2Alleles = new ArrayList(source2Map.values()); + + writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS); + if ( source1Alleles.isEmpty() ) { + writeAll(source2Alleles, source2, null); + } else if ( source2Alleles.isEmpty() ) { + writeAll(source1Alleles, source1, null); + } else { + writeDifferences(source1Alleles, source2Alleles); + } + } + + private byte[] generateHaplotype(final List sourceVCs, final ReferenceContext refContext) { + + final StringBuilder sb = new StringBuilder(); + + final int startPos = refContext.getWindow().getStart(); + int currentPos = startPos; + final byte[] reference = refContext.getBases(); + + for ( final VariantContext vc : sourceVCs ) { + // add any missing reference context + int vcStart = vc.getStart(); + final int refAlleleLength = vc.getReference().length(); + if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position) + vcStart++; + + while ( currentPos < vcStart ) + sb.append((char)reference[currentPos++ - startPos]); + + // add the alt allele + sb.append(vc.getAlternateAllele(0).getBaseString()); + + // skip the reference allele + currentPos += refAlleleLength; + } + // add any missing reference context + final int stopPos = refContext.getWindow().getStop(); + while ( currentPos < stopPos ) + sb.append((char)reference[currentPos++ - startPos]); + + return sb.toString().getBytes(); + } + + private void writeDifferences(final List source1Alleles, final List source2Alleles) { + int currentIndex1 = 0, currentIndex2 = 0; + final int size1 = source1Alleles.size(), size2 = source2Alleles.size(); + VariantContext current1 = source1Alleles.get(0); + VariantContext current2 = source2Alleles.get(0); + + while ( currentIndex1 < size1 || currentIndex2 < size2 ) { + if ( current1 == null ) { + writeOne(current2, source2, null); + currentIndex2++; + current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); + } else if ( current2 == null ) { + writeOne(current1, source1, null); + currentIndex1++; + current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); + } else { + + final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); + final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); + + if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) { + String status; + if ( loc1.getStart() == loc2.getStart() ) { + final String allele1 = current1.getAlternateAllele(0).getBaseString(); + final String allele2 = current2.getAlternateAllele(0).getBaseString(); + if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 ) + status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS; + else + status = SAME_START_DIFFERENT_ALLELES_STATUS; + } else { + status = OVERLAPPING_EVENTS_STATUS; + } + + writeOne(current1, INTERSECTION_SET, status); + currentIndex1++; + currentIndex2++; + current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); + current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); + } else if ( loc1.isBefore(loc2) ) { + writeOne(current1, source1, null); + currentIndex1++; + current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); + } else { + writeOne(current2, source2, null); + currentIndex2++; + current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); + } + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java new file mode 100755 index 000000000..45deb9b2a --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -0,0 +1,149 @@ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.jgrapht.graph.DefaultDirectedGraph; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 23, 2011 + */ +// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. +// This is different from most graph traversals because we want to test paths from any source node to any sink node. +public class KBestPaths { + + // static access only + protected KBestPaths() { } + private static int MAX_PATHS_TO_HOLD = 100; + + protected static class MyInt { public int val = 0; } + + // class to keep track of paths + protected static class Path { + + // the last vertex seen in the path + private DeBruijnVertex lastVertex; + + // the list of edges comprising the path + private ArrayList edges; + + // the scores for the path + private int totalScore = 0, lowestEdge = -1; + + public Path( final DeBruijnVertex initialVertex ) { + lastVertex = initialVertex; + edges = new ArrayList(0); + } + + public Path( final Path p, final DefaultDirectedGraph graph, final DeBruijnEdge edge ) { + lastVertex = graph.getEdgeTarget(edge); + edges = new ArrayList(p.edges); + edges.add(edge); + totalScore = p.totalScore + edge.getMultiplicity(); + lowestEdge = ( p.lowestEdge == -1 ) ? edge.getMultiplicity() : Math.min(p.lowestEdge, edge.getMultiplicity()); + } + + public boolean containsEdge( final DefaultDirectedGraph graph, final DeBruijnEdge edge ) { + final DeBruijnVertex targetVertex = graph.getEdgeTarget(edge); + for( final DeBruijnEdge e : edges ) { + if( e.equals(graph, edge) || graph.getEdgeTarget(e).equals(targetVertex) ) { + return true; + } + } + + return false; + } + + public ArrayList getEdges() { return edges; } + + public int getScore() { return totalScore; } + + public int getLowestEdge() { return lowestEdge; } + + public DeBruijnVertex getLastVertexInPath() { return lastVertex; } + + public byte[] getBases( final DefaultDirectedGraph graph ) { + if( edges.size() == 0 ) { return lastVertex.getSequence(); } + + byte[] bases = graph.getEdgeSource( edges.get(0) ).getSequence(); + for( final DeBruijnEdge e : edges ) { + bases = ArrayUtils.addAll(bases, graph.getEdgeTarget( e ).getSuffix()); + } + return bases; + } + } + + protected static class PathComparatorTotalScore implements Comparator { + public int compare(final Path path1, final Path path2) { + return path1.totalScore - path2.totalScore; + } + } + + protected static class PathComparatorLowestEdge implements Comparator { + public int compare(final Path path1, final Path path2) { + return path2.lowestEdge - path1.lowestEdge; + } + } + + public static List getKBestPaths( final DefaultDirectedGraph graph, final int k ) { + if( k > MAX_PATHS_TO_HOLD/2 ) { throw new ReviewedStingException("Asked for more paths than MAX_PATHS_TO_HOLD!"); } + final ArrayList bestPaths = new ArrayList(); + + // run a DFS for best paths + for( final DeBruijnVertex v : graph.vertexSet() ) { + if( graph.inDegreeOf(v) == 0 ) { + findBestPaths(graph, new Path(v), bestPaths); + } + } + + Collections.sort(bestPaths, new PathComparatorLowestEdge() ); + Collections.reverse(bestPaths); + return bestPaths.subList(0, Math.min(k, bestPaths.size())); + } + + private static void findBestPaths( final DefaultDirectedGraph graph, final Path path, final List bestPaths ) { + findBestPaths(graph, path, bestPaths, new MyInt()); + } + + private static void findBestPaths( final DefaultDirectedGraph graph, final Path path, final List bestPaths, MyInt n ) { + + // did we hit the end of a path? + if ( allOutgoingEdgesHaveBeenVisited(graph, path) ) { + if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { + // clean out some low scoring paths + Collections.sort(bestPaths, new PathComparatorLowestEdge() ); + for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } + } + bestPaths.add(path); + } else if( n.val > 10000) { + // do nothing, just return + } else { + // recursively run DFS + final ArrayList edgeArrayList = new ArrayList(); + edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex)); + Collections.sort(edgeArrayList); + Collections.reverse(edgeArrayList); + for ( final DeBruijnEdge edge : edgeArrayList ) { + // make sure the edge is not already in the path + if ( path.containsEdge(graph, edge) ) + continue; + + final Path newPath = new Path(path, graph, edge); + n.val++; + findBestPaths(graph, newPath, bestPaths, n); + } + } + } + + private static boolean allOutgoingEdgesHaveBeenVisited( final DefaultDirectedGraph graph, final Path path ) { + for( final DeBruijnEdge edge : graph.outgoingEdgesOf(path.lastVertex) ) { + if( !path.containsEdge(graph, edge) ) { + return false; + } + } + return true; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java new file mode 100644 index 000000000..535508d09 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +public class LikelihoodCalculationEngine { + + private static final double LOG_ONE_HALF = -Math.log10(2.0); + private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1; + private final byte constantGCP; + private final boolean DEBUG; + private final PairHMM pairHMM; + + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) { + pairHMM = new PairHMM( noBanded ); + this.constantGCP = constantGCP; + DEBUG = debug; + } + + public void computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { + final int numHaplotypes = haplotypes.size(); + + int X_METRIC_LENGTH = 0; + for( final String sample : perSampleReadList.keySet() ) { + for( final GATKSAMRecord read : perSampleReadList.get(sample) ) { + final int readLength = read.getReadLength(); + if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } + } + } + int Y_METRIC_LENGTH = 0; + for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { + final int haplotypeLength = haplotypes.get(jjj).getBases().length; + if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } + } + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + X_METRIC_LENGTH += 2; + Y_METRIC_LENGTH += 2; + + // initial arrays to hold the probabilities of being in the match, insertion and deletion cases + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + // for each sample's reads + for( final String sample : perSampleReadList.keySet() ) { + //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } + // evaluate the likelihood of the reads given those haplotypes + computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray ); + } + } + + private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + final int numHaplotypes = haplotypes.size(); + final int numReads = reads.size(); + final double[][] readLikelihoods = new double[numHaplotypes][numReads]; + for( int iii = 0; iii < numReads; iii++ ) { + final GATKSAMRecord read = reads.get(iii); + + final byte[] overallGCP = new byte[read.getReadLength()]; + Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? + Haplotype previousHaplotypeSeen = null; + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { + readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality + //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated + //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated + readQuals[kkk] = ( readQuals[kkk] < (byte) 17 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + } + + for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { + final Haplotype haplotype = haplotypes.get(jjj); + final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); + previousHaplotypeSeen = haplotype; + + readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, + haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + } + } + for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { + haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] ); + } + } + + private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { + for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){ + if( b1[iii] != b2[iii] ) { + return iii; + } + } + return b1.length; + } + + @Requires({"haplotypes.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final String sample ) { + // set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization? + final ArrayList> haplotypeMapping = new ArrayList>(); + for( final Haplotype h : haplotypes ) { + final ArrayList list = new ArrayList(); + list.add(h); + haplotypeMapping.add(list); + } + return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping ); + } + + @Requires({"haplotypeMapping.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList> haplotypeMapping ) { + + final int numHaplotypes = haplotypeMapping.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { + final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); + for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { + final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); + double haplotypeLikelihood = 0.0; + for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF; + } + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum? + } + } + } + } + + // normalize the diploid likelihoods matrix + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + } + + @Requires({"haplotypes.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final Set samples ) { + // set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization? + final ArrayList> haplotypeMapping = new ArrayList>(); + for( final Haplotype h : haplotypes ) { + final ArrayList list = new ArrayList(); + list.add(h); + haplotypeMapping.add(list); + } + + final int numHaplotypes = haplotypeMapping.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { + for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { + double haplotypeLikelihood = 0.0; + for( final String sample : samples ) { + final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); + final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); + for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF; + } + } + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum? + } + } + } + } + + // normalize the diploid likelihoods matrix + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + } + + @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) + @Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"}) + protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) { + final int numHaplotypes = likelihoodMatrix.length; + double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; + int index = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ){ + genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj]; + } + } + genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); + index = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ){ + likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++]; + } + } + return likelihoodMatrix; + } + + /* + @Requires({"haplotypes.size() > 0"}) + @Ensures({"result.size() <= haplotypes.size()"}) + public ArrayList selectBestHaplotypes( final ArrayList haplotypes ) { + + // BUGBUG: This function needs a lot of work. Need to use 4-gamete test or Tajima's D to decide to break up events into separate pieces for genotyping + + final int numHaplotypes = haplotypes.size(); + final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples + final ArrayList bestHaplotypesIndexList = new ArrayList(); + bestHaplotypesIndexList.add(0); // always start with the reference haplotype + final double[][][] haplotypeLikelihoodMatrix = new double[sampleKeySet.size()][numHaplotypes][numHaplotypes]; + + int sampleCount = 0; + for( final String sample : sampleKeySet ) { + haplotypeLikelihoodMatrix[sampleCount++] = computeDiploidHaplotypeLikelihoods( haplotypes, sample ); + } + + int hap1 = 0; + int hap2 = 0; + int chosenSample = 0; + //double bestElement = Double.NEGATIVE_INFINITY; + final int maxChosenHaplotypes = Math.min( 15, sampleKeySet.size() * 2 + 1 ); + while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) { + double maxElement = Double.NEGATIVE_INFINITY; + for( int kkk = 0; kkk < sampleCount; kkk++ ) { + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + if( haplotypeLikelihoodMatrix[kkk][iii][jjj] > maxElement ) { + maxElement = haplotypeLikelihoodMatrix[kkk][iii][jjj]; + hap1 = iii; + hap2 = jjj; + chosenSample = kkk; + } + } + } + } + if( maxElement == Double.NEGATIVE_INFINITY ) { break; } + + if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } + if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } + + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + haplotypeLikelihoodMatrix[chosenSample][iii][jjj] = Double.NEGATIVE_INFINITY; + } + } + } + + if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + + final ArrayList bestHaplotypes = new ArrayList(); + for( final int hIndex : bestHaplotypesIndexList ) { + bestHaplotypes.add( haplotypes.get(hIndex) ); + } + return bestHaplotypes; + } + */ + + @Requires({"haplotypes.size() > 0"}) + @Ensures({"result.size() <= haplotypes.size()"}) + public ArrayList selectBestHaplotypes( final ArrayList haplotypes ) { + + final int numHaplotypes = haplotypes.size(); + final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples + final ArrayList bestHaplotypesIndexList = new ArrayList(); + bestHaplotypesIndexList.add(0); // always start with the reference haplotype + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( haplotypes, sampleKeySet ); // all samples pooled together + + int hap1 = 0; + int hap2 = 0; + //double bestElement = Double.NEGATIVE_INFINITY; + final int maxChosenHaplotypes = Math.min( 8, sampleKeySet.size() * 2 + 1 ); + while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) { + double maxElement = Double.NEGATIVE_INFINITY; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) { + maxElement = haplotypeLikelihoodMatrix[iii][jjj]; + hap1 = iii; + hap2 = jjj; + } + } + } + if( maxElement == Double.NEGATIVE_INFINITY ) { break; } + if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } + haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY; + + if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } + if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } + } + + if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + + final ArrayList bestHaplotypes = new ArrayList(); + for( final int hIndex : bestHaplotypesIndexList ) { + bestHaplotypes.add( haplotypes.get(hIndex) ); + } + return bestHaplotypes; + } + + public static Map>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) { + final Map>> returnMap = new HashMap>>(); + final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); + for( final String sample : perSampleReadList.keySet() ) { + final Map> alleleReadMap = new HashMap>(); + final ArrayList readsForThisSample = perSampleReadList.get(sample); + for( int iii = 0; iii < readsForThisSample.size(); iii++ ) { + final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same! + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { + final double likelihoods[] = new double[call.getFirst().getAlleles().size()]; + int count = 0; + for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood + double maxLikelihood = Double.NEGATIVE_INFINITY; + for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object) + final double likelihood = h.getReadLikelihoods(sample)[iii]; + if( likelihood > maxLikelihood ) { + maxLikelihood = likelihood; + } + } + likelihoods[count++] = maxLikelihood; + } + final int bestAllele = MathUtils.maxElementIndex(likelihoods); + final double bestLikelihood = likelihoods[bestAllele]; + Allele allele = Allele.NO_CALL; + boolean isInformativeRead = false; + for( final double likelihood : likelihoods ) { + if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) { + isInformativeRead = true; + break; + } + } + // uninformative reads get the no call Allele + if( isInformativeRead ) { + allele = call.getFirst().getAlleles().get(bestAllele); + } + List readList = alleleReadMap.get(allele); + if( readList == null ) { + readList = new ArrayList(); + alleleReadMap.put(allele, readList); + } + readList.add(read); + } + } + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods + List readList = alleleReadMap.get(Allele.NO_CALL); + if( readList == null ) { + readList = new ArrayList(); + alleleReadMap.put(Allele.NO_CALL, readList); + } + for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample) ) { + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { + readList.add(read); + } + } + returnMap.put(sample, alleleReadMap); + } + return returnMap; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java new file mode 100755 index 000000000..bf6c82d82 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 14, 2011 + */ +public abstract class LocalAssemblyEngine { + + public enum ASSEMBLER { + SIMPLE_DE_BRUIJN + } + + protected LocalAssemblyEngine() { + } + + public abstract ArrayList runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, ArrayList activeAllelesToGenotype); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java new file mode 100755 index 000000000..e2bc7a10f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -0,0 +1,372 @@ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.jgrapht.graph.DefaultDirectedGraph; + +import java.io.PrintStream; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: ebanks, rpoplin + * Date: Mar 14, 2011 + */ + +public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { + + private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers + private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; + private static final byte MIN_QUALITY = (byte) 17; + + // Smith-Waterman parameters originally copied from IndelRealigner + private static final double SW_MATCH = 5.0; // 1.0; + private static final double SW_MISMATCH = -10.0; //-1.0/3.0; + private static final double SW_GAP = -22.0; //-1.0-1.0/3.0; + private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; + + private final boolean DEBUG; + private final PrintStream GRAPH_WRITER; + private final ArrayList> graphs = new ArrayList>(); + + private int PRUNE_FACTOR = 1; + + public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter ) { + super(); + DEBUG = debug; + GRAPH_WRITER = graphWriter; + } + + public ArrayList runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList activeAllelesToGenotype ) { + this.PRUNE_FACTOR = PRUNE_FACTOR; + + // create the graphs + createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); + + // clean up the graphs by pruning and merging + for( final DefaultDirectedGraph graph : graphs ) { + pruneGraph( graph, PRUNE_FACTOR ); + //eliminateNonRefPaths( graph ); + mergeNodes( graph ); + } + + if( GRAPH_WRITER != null ) { + printGraphs(); + } + + // find the best paths in the graphs + return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); + } + + private void createDeBruijnGraphs( final ArrayList reads, final Haplotype refHaplotype ) { + graphs.clear(); + + // create the graph + for( int kmer = 31; kmer <= 75; kmer += 6 ) { + final DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); + if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) { + graphs.add(graph); + } + } + } + + protected static void mergeNodes( final DefaultDirectedGraph graph ) { + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; + for( final DeBruijnEdge e : graph.edgeSet() ) { + final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e); + final DeBruijnVertex incomingVertex = graph.getEdgeSource(e); + if( !outgoingVertex.equals(incomingVertex) && graph.inDegreeOf(outgoingVertex) == 1 && graph.outDegreeOf(incomingVertex) == 1) { + final Set outEdges = graph.outgoingEdgesOf(outgoingVertex); + final Set inEdges = graph.incomingEdgesOf(incomingVertex); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + } else if( inEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } else if( outEdges.size() == 1 ) { + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } + + final DeBruijnVertex addedVertex = new DeBruijnVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer ); + graph.addVertex(addedVertex); + for( final DeBruijnEdge edge : outEdges ) { + graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity())); + } + for( final DeBruijnEdge edge : inEdges ) { + graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity())); + } + + graph.removeVertex( incomingVertex ); + graph.removeVertex( outgoingVertex ); + foundNodesToMerge = true; + break; + } + } + } + } + + protected static void pruneGraph( final DefaultDirectedGraph graph, final int pruneFactor ) { + final ArrayList edgesToRemove = new ArrayList(); + for( final DeBruijnEdge e : graph.edgeSet() ) { + if( e.getMultiplicity() <= pruneFactor && !e.getIsRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor + edgesToRemove.add(e); + } + } + graph.removeAllEdges(edgesToRemove); + + // Run through the graph and clean up singular orphaned nodes + final ArrayList verticesToRemove = new ArrayList(); + for( final DeBruijnVertex v : graph.vertexSet() ) { + if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + graph.removeAllVertices(verticesToRemove); + } + + protected static void eliminateNonRefPaths( final DefaultDirectedGraph graph ) { + final ArrayList verticesToRemove = new ArrayList(); + boolean done = false; + while( !done ) { + done = true; + for( final DeBruijnVertex v : graph.vertexSet() ) { + if( graph.inDegreeOf(v) == 0 || graph.outDegreeOf(v) == 0 ) { + boolean isRefNode = false; + for( final DeBruijnEdge e : graph.edgesOf(v) ) { + if( e.getIsRef() ) { + isRefNode = true; + break; + } + } + if( !isRefNode ) { + done = false; + verticesToRemove.add(v); + } + } + } + graph.removeAllVertices(verticesToRemove); + verticesToRemove.clear(); + } + } + + private static boolean createGraphFromSequences( final DefaultDirectedGraph graph, final ArrayList reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { + final byte[] refSequence = refHaplotype.getBases(); + if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { + final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; + for (int i = 0; i < kmersInSequence - 1; i++) { + // get the kmers + final byte[] kmer1 = new byte[KMER_LENGTH]; + System.arraycopy(refSequence, i, kmer1, 0, KMER_LENGTH); + final byte[] kmer2 = new byte[KMER_LENGTH]; + System.arraycopy(refSequence, i+1, kmer2, 0, KMER_LENGTH); + if( !addKmersToGraph(graph, kmer1, kmer2, true) ) { + if( DEBUG ) { + System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); + } + return false; + } + } + } + + for( final GATKSAMRecord read : reads ) { + final byte[] sequence = read.getReadBases(); + final byte[] qualities = read.getBaseQualities(); + if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { + final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + // if the qualities of all the bases in the kmers are high enough + boolean badKmer = false; + for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { + if( qualities[jjj] < MIN_QUALITY ) { + badKmer = true; + break; + } + } + if( !badKmer ) { + // get the kmers + final byte[] kmer1 = new byte[KMER_LENGTH]; + System.arraycopy(sequence, iii, kmer1, 0, KMER_LENGTH); + final byte[] kmer2 = new byte[KMER_LENGTH]; + System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH); + + addKmersToGraph(graph, kmer1, kmer2, false); + } + } + } + } + return true; + } + + protected static boolean addKmersToGraph( final DefaultDirectedGraph graph, final byte[] kmer1, final byte[] kmer2, final boolean isRef ) { + + final int numVertexBefore = graph.vertexSet().size(); + final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length ); + graph.addVertex(v1); + final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length ); + graph.addVertex(v2); + if( isRef && graph.vertexSet().size() == numVertexBefore ) { return false; } + + final DeBruijnEdge targetEdge = graph.getEdge(v1, v2); + if ( targetEdge == null ) { + graph.addEdge(v1, v2, new DeBruijnEdge( isRef )); + } else { + if( isRef ) { + targetEdge.setIsRef( true ); + } + targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1); + } + return true; + } + + private void printGraphs() { + int count = 0; + for( final DefaultDirectedGraph graph : graphs ) { + GRAPH_WRITER.println("digraph kmer" + count++ +" {"); + for( final DeBruijnEdge edge : graph.edgeSet() ) { + if( edge.getMultiplicity() > PRUNE_FACTOR ) { + GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];"); + } + if( edge.getIsRef() ) { + GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); + } + if( !edge.getIsRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } + } + for( final DeBruijnVertex v : graph.vertexSet() ) { + final String label = ( graph.inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() ); + GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]"); + } + GRAPH_WRITER.println("}"); + } + } + + @Ensures({"result.contains(refHaplotype)"}) + private ArrayList findBestPaths( final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final ArrayList activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { + final ArrayList returnHaplotypes = new ArrayList(); + + // add the reference haplotype separately from all the others + final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( fullReferenceWithPadding, refHaplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + refHaplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); + refHaplotype.setCigar( swConsensus.getCigar() ); + if( !returnHaplotypes.add( refHaplotype ) ) { + throw new ReviewedStingException("Unable to add reference haplotype during assembly: " + refHaplotype); + } + + final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); + final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); + + for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()); + if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { + return returnHaplotypes; + //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype); + } + } + } + + for( final DefaultDirectedGraph graph : graphs ) { + for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); + if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { + if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place + for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); + if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) { + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); + } + } + } + } + } + } + } + + if( DEBUG ) { + if( returnHaplotypes.size() > 1 ) { + System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against."); + } else { + System.out.println("Found only the reference haplotype in the assembly graph."); + } + for( final Haplotype h : returnHaplotypes ) { + System.out.println( h.toString() ); + System.out.println( "> Cigar = " + h.getCigar() ); + } + } + + return returnHaplotypes; + } + + private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList haplotypeList, final int activeRegionStart, final int activeRegionStop ) { + //final int sizeOfActiveRegion = activeRegionStop - activeRegionStart; + final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); + haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0) ); + + if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 ) { // protect against SW failures + return false; + } + + final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true ); + int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true ); + if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) { + hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal + } + byte[] newHaplotypeBases; + // extend partial haplotypes to contain the full active region sequence + int leftBreakPoint = 0; + int rightBreakPoint = 0; + if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), + haplotype.getBases()), + ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); + leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart; + rightBreakPoint = leftBreakPoint + haplotype.getBases().length; + //newHaplotypeBases = haplotype.getBases(); + //return false; // piece of haplotype isn't anchored within the active region so don't build a haplotype out of it + } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + //return false; + newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) ); + //newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), 0, hapStop); + leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart; + } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + //return false; + newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); + //newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length); + rightBreakPoint = haplotype.getBases().length - hapStart; + } else { + newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop); + } + + final Haplotype h = new Haplotype( newHaplotypeBases ); + final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + + h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); + h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); + h.leftBreakPoint = leftBreakPoint; + h.rightBreakPoint = rightBreakPoint; + if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures + return false; + } + + if( !haplotypeList.contains(h) ) { + haplotypeList.add(h); + return true; + } else { + return false; + } + } +} \ No newline at end of file