Moving HaplotypeCaller from private to protected
This commit is contained in:
parent
6efbcd99f1
commit
c55934043e
|
|
@ -0,0 +1,60 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
|
||||
// simple edge class for connecting nodes in the graph
|
||||
public class DeBruijnEdge implements Comparable<DeBruijnEdge> {
|
||||
|
||||
private int multiplicity;
|
||||
private boolean isRef;
|
||||
|
||||
public DeBruijnEdge() {
|
||||
multiplicity = 1;
|
||||
isRef = false;
|
||||
}
|
||||
|
||||
public DeBruijnEdge( final boolean isRef ) {
|
||||
multiplicity = 1;
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
public DeBruijnEdge( final boolean isRef, final int multiplicity ) {
|
||||
this.multiplicity = multiplicity;
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
public int getMultiplicity() {
|
||||
return multiplicity;
|
||||
}
|
||||
|
||||
public void setMultiplicity( final int value ) {
|
||||
multiplicity = value;
|
||||
}
|
||||
|
||||
public boolean getIsRef() {
|
||||
return isRef;
|
||||
}
|
||||
|
||||
public void setIsRef( final boolean isRef ) {
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
public boolean equals( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
|
||||
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
public boolean equals( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge, final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph2 ) {
|
||||
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo( final DeBruijnEdge that ) {
|
||||
return this.multiplicity - that.multiplicity;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
// simple node class for storing kmer sequences
|
||||
public class DeBruijnVertex {
|
||||
|
||||
protected final byte[] sequence;
|
||||
public final int kmer;
|
||||
|
||||
public DeBruijnVertex( final byte[] sequence, final int kmer ) {
|
||||
this.sequence = sequence;
|
||||
this.kmer = kmer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals( Object v ) {
|
||||
return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
|
||||
return Arrays.hashCode(sequence);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return new String(sequence);
|
||||
}
|
||||
|
||||
public String getSuffixString() {
|
||||
return new String( getSuffix() );
|
||||
}
|
||||
|
||||
public byte[] getSequence() {
|
||||
return sequence;
|
||||
}
|
||||
|
||||
public byte[] getSuffix() {
|
||||
return Arrays.copyOfRange( sequence, kmer - 1, sequence.length );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,610 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class GenotypingEngine {
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final int MNP_LOOK_AHEAD;
|
||||
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
|
||||
public GenotypingEngine( final boolean DEBUG, final int MNP_LOOK_AHEAD, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
|
||||
this.DEBUG = DEBUG;
|
||||
this.MNP_LOOK_AHEAD = MNP_LOOK_AHEAD;
|
||||
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
noCall.add(Allele.NO_CALL);
|
||||
}
|
||||
|
||||
// This function is the streamlined approach, currently not being used
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) {
|
||||
// Prepare the list of haplotype indices to genotype
|
||||
final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();
|
||||
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) );
|
||||
}
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make());
|
||||
}
|
||||
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here
|
||||
|
||||
// Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF
|
||||
haplotypesToRemove.add(h);
|
||||
}
|
||||
}
|
||||
haplotypes.removeAll(haplotypesToRemove);
|
||||
|
||||
if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
|
||||
final List<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>> returnVCs = new ArrayList<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>>();
|
||||
// set up the default 1-to-1 haplotype mapping object
|
||||
final HashMap<Allele,ArrayList<Haplotype>> haplotypeMapping = new HashMap<Allele,ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.put(call.getAllele(h.getBases()), list);
|
||||
}
|
||||
returnVCs.add( new Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>(call,haplotypeMapping) );
|
||||
return returnVCs;
|
||||
}
|
||||
|
||||
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
int count = 0;
|
||||
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
}
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
|
||||
startPosKeySet.addAll(h.getEventMap().keySet());
|
||||
}
|
||||
|
||||
// Create the VC merge priority list
|
||||
final ArrayList<String> priorityList = new ArrayList<String>();
|
||||
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
|
||||
priorityList.add("HC" + iii);
|
||||
}
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
}
|
||||
}
|
||||
|
||||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final int myNumHaplotypes = alleleMapper.size();
|
||||
final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < myNumHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
|
||||
// using the allele mapping object translate the haplotype allele into the event allele
|
||||
final Genotype g = new GenotypeBuilder(sample)
|
||||
.alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes))
|
||||
.phased(loc != startPosKeySet.first())
|
||||
.PL(genotypeLikelihoods).make();
|
||||
myGenotypes.add(g);
|
||||
}
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(
|
||||
new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) );
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
}
|
||||
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser, final ArrayList<VariantContext> activeAllelesToGenotype ) {
|
||||
|
||||
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
int count = 0;
|
||||
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
|
||||
if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
System.out.println( "> Left and right breaks = (" + h.leftBreakPoint + " , " + h.rightBreakPoint + ")");
|
||||
System.out.println( ">> Events = " + h.getEventMap());
|
||||
}
|
||||
}
|
||||
// Create the VC merge priority list
|
||||
final ArrayList<String> priorityList = new ArrayList<String>();
|
||||
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
|
||||
priorityList.add("HC" + iii);
|
||||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes, priorityList );
|
||||
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
|
||||
}
|
||||
if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
startPosKeySet.add( compVC.getStart() );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
}
|
||||
}
|
||||
} else { // we are in GGA mode!
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
if( compVC.getStart() == loc ) {
|
||||
priorityList.clear();
|
||||
int alleleCount = 0;
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
HashSet<Allele> alleleSet = new HashSet<Allele>(2);
|
||||
alleleSet.add(compVC.getReference());
|
||||
alleleSet.add(compAltAllele);
|
||||
priorityList.add("Allele" + alleleCount);
|
||||
eventsAtThisLoc.add(new VariantContextBuilder(compVC).alleles(alleleSet).source("Allele"+alleleCount).make());
|
||||
alleleCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||
|
||||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final int numHaplotypes = alleleMapper.size();
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
|
||||
if( call != null ) {
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
|
||||
}
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
}
|
||||
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes, final ArrayList<String> priorityList ) {
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
final ArrayList<String> stringsToRemove = new ArrayList<String>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().values() ) {
|
||||
if( vc.isSymbolic() ) {
|
||||
for( final Haplotype h2 : haplotypes ) {
|
||||
for( final VariantContext vc2 : h2.getEventMap().values() ) {
|
||||
if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
|
||||
haplotypesToRemove.add(h);
|
||||
stringsToRemove.add(vc.getSource());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
haplotypes.removeAll(haplotypesToRemove);
|
||||
priorityList.removeAll(stringsToRemove);
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
final int MAX_SIZE_TO_COMBINE = 10;
|
||||
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
|
||||
if( startPosKeySet.size() <= 1 ) { return; }
|
||||
|
||||
boolean mapWasUpdated = true;
|
||||
while( mapWasUpdated ) {
|
||||
mapWasUpdated = false;
|
||||
|
||||
// loop over the set of start locations and consider pairs that start near each other
|
||||
final Iterator<Integer> iter = startPosKeySet.iterator();
|
||||
int thisStart = iter.next();
|
||||
while( iter.hasNext() ) {
|
||||
final int nextStart = iter.next();
|
||||
if( nextStart - thisStart < MAX_SIZE_TO_COMBINE) {
|
||||
boolean isBiallelic = true;
|
||||
VariantContext thisVC = null;
|
||||
VariantContext nextVC = null;
|
||||
int x11 = 0;
|
||||
int x12 = 0;
|
||||
int x21 = 0;
|
||||
int x22 = 0;
|
||||
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// only make complex substitutions out of consecutive biallelic sites
|
||||
final VariantContext thisHapVC = h.getEventMap().get(thisStart);
|
||||
if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype
|
||||
if( thisVC == null ) {
|
||||
thisVC = thisHapVC;
|
||||
} else if( !thisHapVC.hasSameAllelesAs( thisVC ) ) {
|
||||
isBiallelic = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
final VariantContext nextHapVC = h.getEventMap().get(nextStart);
|
||||
if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype
|
||||
if( nextVC == null ) {
|
||||
nextVC = nextHapVC;
|
||||
} else if( !nextHapVC.hasSameAllelesAs( nextVC ) ) {
|
||||
isBiallelic = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// count up the co-occurrences of the events for the R^2 calculation
|
||||
// BUGBUG: use haplotype likelihoods per-sample to make this more accurate
|
||||
if( thisHapVC == null ) {
|
||||
if( nextHapVC == null ) { x11++; }
|
||||
else { x12++; }
|
||||
} else {
|
||||
if( nextHapVC == null ) { x21++; }
|
||||
else { x22++; }
|
||||
}
|
||||
}
|
||||
if( thisVC == null || nextVC == null ) {
|
||||
continue;
|
||||
//throw new ReviewedStingException("StartPos TreeSet has an entry for an event that is found on no haplotype. start pos = " + thisStart + ", next pos = " + nextStart);
|
||||
}
|
||||
if( isBiallelic ) {
|
||||
final double R2 = calculateR2LD( x11, x12, x21, x22 );
|
||||
if( DEBUG ) {
|
||||
System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2));
|
||||
System.out.println("-- " + thisVC);
|
||||
System.out.println("-- " + nextVC);
|
||||
}
|
||||
if( R2 > MERGE_EVENTS_R2_THRESHOLD ) {
|
||||
|
||||
byte[] refBases = thisVC.getReference().getBases();
|
||||
byte[] altBases = thisVC.getAlternateAllele(0).getBases();
|
||||
final int thisPadding = ( thisVC.getReference().isNull() || thisVC.getAlternateAllele(0).isNull() ? 1 : 0 );
|
||||
final int nextPadding = ( nextVC.getReference().isNull() || nextVC.getAlternateAllele(0).isNull() ? 1 : 0 );
|
||||
for( int locus = thisStart + refBases.length + thisPadding ; locus < nextStart + nextPadding; locus++ ) {
|
||||
final byte refByte = ref[ locus - refLoc.getStart() ];
|
||||
refBases = ArrayUtils.add(refBases, refByte);
|
||||
altBases = ArrayUtils.add(altBases, refByte);
|
||||
}
|
||||
refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases());
|
||||
altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
|
||||
|
||||
final ArrayList<Allele> mergedAlleles = new ArrayList<Allele>();
|
||||
mergedAlleles.add( Allele.create( refBases, true ) );
|
||||
mergedAlleles.add( Allele.create( altBases, false ) );
|
||||
final VariantContext mergedVC = ( refBases.length == altBases.length ?
|
||||
new VariantContextBuilder("MNP", thisVC.getChr(), thisVC.getStart() + (thisVC.isIndel() && !thisVC.isComplexIndel() ? 1 : 0) , nextVC.getEnd(), mergedAlleles).make() :
|
||||
new VariantContextBuilder("Complex", thisVC.getChr(), thisVC.getStart(), nextVC.getEnd(), mergedAlleles).referenceBaseForIndel(ref[thisStart-refLoc.getStart()]).make() );
|
||||
|
||||
// remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer, VariantContext> eventMap = h.getEventMap();
|
||||
if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) {
|
||||
eventMap.remove(thisStart);
|
||||
eventMap.remove(nextStart);
|
||||
eventMap.put(mergedVC.getStart(), mergedVC);
|
||||
}
|
||||
}
|
||||
startPosKeySet.add(mergedVC.getStart());
|
||||
boolean containsStart = false;
|
||||
boolean containsNext = false;
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer, VariantContext> eventMap = h.getEventMap();
|
||||
if( eventMap.containsKey(thisStart) ) { containsStart = true; }
|
||||
if( eventMap.containsKey(nextStart) ) { containsNext = true; }
|
||||
}
|
||||
if(!containsStart) { startPosKeySet.remove(thisStart); }
|
||||
if(!containsNext) { startPosKeySet.remove(nextStart); }
|
||||
|
||||
if( DEBUG ) { System.out.println("====> " + mergedVC); }
|
||||
mapWasUpdated = true;
|
||||
break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events
|
||||
}
|
||||
}
|
||||
}
|
||||
thisStart = nextStart;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Requires({"x11 >= 0", "x12 >= 0", "x21 >= 0", "x22 >= 0"})
|
||||
@Ensures({"result >= 0.0", "result <= 1.0"})
|
||||
protected static double calculateR2LD( final int x11, final int x12, final int x21, final int x22 ) {
|
||||
final int total = x11 + x12 + x21 + x22;
|
||||
final double pa1b1 = ((double) x11) / ((double) total);
|
||||
final double pa1b2 = ((double) x12) / ((double) total);
|
||||
final double pa2b1 = ((double) x21) / ((double) total);
|
||||
final double pa1 = pa1b1 + pa1b2;
|
||||
final double pb1 = pa1b1 + pa2b1;
|
||||
return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) );
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
|
||||
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
|
||||
protected static ArrayList<ArrayList<Haplotype>> createAlleleMapper( final int loc, final ArrayList<VariantContext> eventsAtThisLoc, final ArrayList<Haplotype> haplotypes ) {
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
|
||||
final ArrayList<Haplotype> refList = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
|
||||
refList.add(h);
|
||||
} else {
|
||||
boolean foundInEventList = false;
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
foundInEventList = true;
|
||||
}
|
||||
}
|
||||
if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype
|
||||
refList.add(h);
|
||||
}
|
||||
}
|
||||
}
|
||||
alleleMapper.add(refList);
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
list.add(h);
|
||||
}
|
||||
}
|
||||
alleleMapper.add(list);
|
||||
}
|
||||
return alleleMapper;
|
||||
}
|
||||
|
||||
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
|
||||
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final ArrayList<ArrayList<Haplotype>> alleleMapper, final ArrayList<Haplotype> haplotypes ) {
|
||||
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
|
||||
final ArrayList<Allele> eventAllelesForSample = new ArrayList<Allele>();
|
||||
for( final Allele a : haplotypeAllelesForSample ) {
|
||||
final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a));
|
||||
for( int iii = 0; iii < alleleMapper.size(); iii++ ) {
|
||||
final ArrayList<Haplotype> mappedHaplotypes = alleleMapper.get(iii);
|
||||
if( mappedHaplotypes.contains(haplotype) ) {
|
||||
eventAllelesForSample.add(eventAlleles.get(iii));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return eventAllelesForSample;
|
||||
}
|
||||
|
||||
protected static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {
|
||||
for( final VariantContext vc : list ) {
|
||||
if( vc.hasSameAllelesAs(vcToTest) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
|
||||
return generateVCsFromAlignment(null, alignmentStartHapwrtRef, cigar, ref, alignment, refLoc, sourceNameToAdd, MNP_LOOK_AHEAD); // BUGBUG: needed for compatibility with HaplotypeResolver code
|
||||
}
|
||||
|
||||
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
|
||||
final HashMap<Integer,VariantContext> vcs = new HashMap<Integer,VariantContext>();
|
||||
|
||||
int refPos = alignmentStartHapwrtRef;
|
||||
if( refPos < 0 ) { return null; } // Protection against SW failures
|
||||
int alignmentPos = 0;
|
||||
|
||||
for( final CigarElement ce : cigar.getCigarElements() ) {
|
||||
final int elementLength = ce.getLength();
|
||||
switch( ce.getOperator() ) {
|
||||
case I:
|
||||
final byte[] insertionBases = Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength );
|
||||
boolean allN = true;
|
||||
for( final byte b : insertionBases ) {
|
||||
if( b != (byte) 'N' ) {
|
||||
allN = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !allN ) {
|
||||
final ArrayList<Allele> insertionAlleles = new ArrayList<Allele>();
|
||||
final int insertionStart = refLoc.getStart() + refPos - 1;
|
||||
if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
|
||||
insertionAlleles.add( Allele.create(ref[refPos-1], true) );
|
||||
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
|
||||
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
|
||||
} else {
|
||||
insertionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, true) );
|
||||
insertionAlleles.add( Allele.create(insertionBases, false) );
|
||||
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
|
||||
}
|
||||
|
||||
}
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
case S:
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
case D:
|
||||
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos, refPos + elementLength );
|
||||
final ArrayList<Allele> deletionAlleles = new ArrayList<Allele>();
|
||||
final int deletionStart = refLoc.getStart() + refPos - 1;
|
||||
// BUGBUG: how often does this symbolic deletion allele case happen?
|
||||
//if( haplotype != null && ( (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength)
|
||||
// || (haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) ) ) {
|
||||
// deletionAlleles.add( Allele.create(ref[refPos-1], true) );
|
||||
// deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
|
||||
// vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make());
|
||||
//} else {
|
||||
deletionAlleles.add( Allele.create(deletionBases, true) );
|
||||
deletionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, false) );
|
||||
vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
|
||||
//}
|
||||
refPos += elementLength;
|
||||
break;
|
||||
case M:
|
||||
int numSinceMismatch = -1;
|
||||
int stopOfMismatch = -1;
|
||||
int startOfMismatch = -1;
|
||||
int refPosStartOfMismatch = -1;
|
||||
for( int iii = 0; iii < elementLength; iii++ ) {
|
||||
if( ref[refPos] != alignment[alignmentPos] && alignment[alignmentPos] != ((byte) 'N') ) {
|
||||
// SNP or start of possible MNP
|
||||
if( stopOfMismatch == -1 ) {
|
||||
startOfMismatch = alignmentPos;
|
||||
stopOfMismatch = alignmentPos;
|
||||
numSinceMismatch = 0;
|
||||
refPosStartOfMismatch = refPos;
|
||||
} else {
|
||||
stopOfMismatch = alignmentPos;
|
||||
}
|
||||
}
|
||||
if( stopOfMismatch != -1) {
|
||||
numSinceMismatch++;
|
||||
}
|
||||
if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) {
|
||||
final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 );
|
||||
final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 );
|
||||
final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
|
||||
snpAlleles.add( Allele.create( refBases, true ) );
|
||||
snpAlleles.add( Allele.create( mismatchBases, false ) );
|
||||
final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
|
||||
vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
|
||||
numSinceMismatch = -1;
|
||||
stopOfMismatch = -1;
|
||||
startOfMismatch = -1;
|
||||
refPosStartOfMismatch = -1;
|
||||
}
|
||||
refPos++;
|
||||
alignmentPos++;
|
||||
}
|
||||
break;
|
||||
case N:
|
||||
case H:
|
||||
case P:
|
||||
default:
|
||||
throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
|
||||
}
|
||||
}
|
||||
return vcs;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,558 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Input bam file(s) from which to make calls
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* VCF file with raw, unrecalibrated SNP and indel calls.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T HaplotypeCaller
|
||||
* -R reference/human_g1k_v37.fasta
|
||||
* -I input.bam
|
||||
* -o output.raw.snps.indels.vcf
|
||||
* </pre>
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since 8/22/11
|
||||
*/
|
||||
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
@ActiveRegionExtension(extension=65, maxRegion=275)
|
||||
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatibleWalker {
|
||||
|
||||
/**
|
||||
* A raw, unfiltered, highly specific callset in VCF format.
|
||||
*/
|
||||
@Output(doc="File to which variants should be written", required = true)
|
||||
protected VariantContextWriter vcfWriter = null;
|
||||
|
||||
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
|
||||
protected PrintStream graphWriter = null;
|
||||
|
||||
@Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false)
|
||||
protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN;
|
||||
|
||||
@Argument(fullName="keepRG", shortName="keepRG", doc="keepRG", required = false)
|
||||
protected String keepRG = null;
|
||||
|
||||
@Argument(fullName="mnpLookAhead", shortName="mnpLookAhead", doc = "The number of bases to combine together to form MNPs out of nearby consecutive SNPs on the same haplotype", required = false)
|
||||
protected int MNP_LOOK_AHEAD = 0;
|
||||
|
||||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = true)
|
||||
protected int MIN_PRUNE_FACTOR = 0;
|
||||
|
||||
@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
|
||||
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
|
||||
|
||||
@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
|
||||
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
|
||||
|
||||
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="gcpHMM", required = false)
|
||||
protected int gcpHMM = 10;
|
||||
|
||||
@Argument(fullName="downsampleRegion", shortName="dr", doc="coverage per sample to downsample each region to", required = false)
|
||||
protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000;
|
||||
|
||||
@Argument(fullName="useExpandedTriggerSet", shortName="expandedTriggers", doc = "If specified, use additional, experimental triggers designed to capture larger indels but which may lead to an increase in the false positive rate", required=false)
|
||||
protected boolean USE_EXPANDED_TRIGGER_SET = false;
|
||||
|
||||
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
|
||||
protected boolean USE_ALLELES_TRIGGER = false;
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP is not used in any way for the calculations themselves.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
||||
|
||||
/**
|
||||
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
|
||||
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
|
||||
* Records that are filtered in the comp track will be ignored.
|
||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||
*/
|
||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||
|
||||
// The following are not used by the Unified Genotyper
|
||||
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
|
||||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||
public boolean alwaysAppendDbsnpId() { return false; }
|
||||
|
||||
/**
|
||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||
*/
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>(Arrays.asList(new String[]{"ClippingRankSumTest"}));
|
||||
|
||||
/**
|
||||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||
*/
|
||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"}));
|
||||
|
||||
/**
|
||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||
*/
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
protected String[] annotationClassesToUse = { "Standard" };
|
||||
|
||||
@ArgumentCollection
|
||||
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
private UnifiedGenotyperEngine UG_engine_simple_genotyper = null;
|
||||
|
||||
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
|
||||
protected boolean DEBUG;
|
||||
|
||||
@Argument(fullName="doBanded", shortName="doBanded", doc="If specified, use the banded option", required = false)
|
||||
protected boolean doBanded = false;
|
||||
|
||||
// the assembly engine
|
||||
LocalAssemblyEngine assemblyEngine = null;
|
||||
|
||||
// the likelihoods engine
|
||||
LikelihoodCalculationEngine likelihoodCalculationEngine = null;
|
||||
|
||||
// the genotyping engine
|
||||
GenotypingEngine genotypingEngine = null;
|
||||
|
||||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private IndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 900;
|
||||
|
||||
// bases with quality less than or equal to this value are trimmed off the tails of the reads
|
||||
private static final byte MIN_TAIL_QUALITY = 20;
|
||||
|
||||
private ArrayList<String> samplesList = new ArrayList<String>();
|
||||
private final static double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
|
||||
private final ArrayList<VariantContext> allelesToGenotype = new ArrayList<VariantContext>();
|
||||
|
||||
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
|
||||
private final static Allele FAKE_ALT_ALLELE = Allele.create("<FAKE_ALT>", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
// get all of the unique sample names
|
||||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
samplesList.addAll( samples );
|
||||
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// all annotation fields from VariantAnnotatorEngine
|
||||
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||
// all callers need to add these standard annotation header lines
|
||||
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
|
||||
VCFConstants.DOWNSAMPLED_KEY,
|
||||
VCFConstants.MLE_ALLELE_COUNT_KEY,
|
||||
VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
|
||||
// all callers need to add these standard FORMAT field header lines
|
||||
VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true,
|
||||
VCFConstants.GENOTYPE_KEY,
|
||||
VCFConstants.GENOTYPE_QUALITY_KEY,
|
||||
VCFConstants.DEPTH_KEY,
|
||||
VCFConstants.GENOTYPE_PL_KEY);
|
||||
// header lines for the experimental HaplotypeCaller-specific annotations
|
||||
headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));
|
||||
|
||||
vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
|
||||
|
||||
try {
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
|
||||
}
|
||||
|
||||
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, doBanded );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, MNP_LOOK_AHEAD, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// isActive
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
// enable deletions in the pileup
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable non primary reads in the active region
|
||||
@Override
|
||||
public boolean wantsNonPrimaryReads() { return true; }
|
||||
|
||||
@Override
|
||||
@Ensures({"result >= 0.0", "result <= 1.0"})
|
||||
public double isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
|
||||
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
|
||||
if( !allelesToGenotype.contains(vc) ) {
|
||||
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object
|
||||
}
|
||||
}
|
||||
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
|
||||
return 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
if( USE_ALLELES_TRIGGER ) {
|
||||
return ( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
|
||||
}
|
||||
|
||||
if( context == null ) { return 0.0; }
|
||||
|
||||
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
|
||||
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
|
||||
for( final String sample : splitContexts.keySet() ) {
|
||||
final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event)
|
||||
Arrays.fill(genotypeLikelihoods, 0.0);
|
||||
|
||||
for( final PileupElement p : splitContexts.get(sample).getBasePileup() ) {
|
||||
final byte qual = ( USE_EXPANDED_TRIGGER_SET ?
|
||||
( p.isNextToSoftClip() || p.isBeforeInsertion() || p.isAfterInsertion() ? ( p.getQual() > QualityUtils.MIN_USABLE_Q_SCORE ? p.getQual() : (byte) 20 ) : p.getQual() )
|
||||
: p.getQual() );
|
||||
if( p.isDeletion() || qual > (USE_EXPANDED_TRIGGER_SET ? QualityUtils.MIN_USABLE_Q_SCORE : (byte) 18) ) {
|
||||
int AA = 0; final int AB = 1; int BB = 2;
|
||||
if( USE_EXPANDED_TRIGGER_SET ) {
|
||||
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ||
|
||||
(!p.getRead().getNGSPlatform().equals(NGSPlatform.SOLID) && ((p.getRead().getReadPairedFlag() && p.getRead().getMateUnmappedFlag()) || BadMateFilter.hasBadMate(p.getRead()))) ) {
|
||||
AA = 2;
|
||||
BB = 0;
|
||||
}
|
||||
} else {
|
||||
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
|
||||
AA = 2;
|
||||
BB = 0;
|
||||
}
|
||||
}
|
||||
genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
|
||||
genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
|
||||
genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
|
||||
}
|
||||
}
|
||||
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
|
||||
final ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add( FAKE_REF_ALLELE );
|
||||
alleles.add( FAKE_ALT_ALLELE );
|
||||
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
return ( vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ) );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) {
|
||||
|
||||
final ArrayList<VariantContext> activeAllelesToGenotype = new ArrayList<VariantContext>();
|
||||
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : allelesToGenotype ) {
|
||||
if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
|
||||
activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode
|
||||
}
|
||||
}
|
||||
allelesToGenotype.removeAll( activeAllelesToGenotype );
|
||||
}
|
||||
|
||||
if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do!
|
||||
if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do!
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do!
|
||||
|
||||
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails
|
||||
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
referenceHaplotype.setIsReference(true);
|
||||
final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING);
|
||||
//int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion ));
|
||||
final ArrayList<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype );
|
||||
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
|
||||
|
||||
activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
|
||||
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
|
||||
|
||||
for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
|
||||
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() )
|
||||
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
|
||||
// add some custom annotations to the calls
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
// Calculate the number of variants on the haplotype
|
||||
int maxNumVar = 0;
|
||||
for( final Allele allele : callResult.getFirst().getAlleles() ) {
|
||||
if( !allele.isReference() ) {
|
||||
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
|
||||
final int numVar = haplotype.getEventMap().size();
|
||||
if( numVar > maxNumVar ) { maxNumVar = numVar; }
|
||||
}
|
||||
}
|
||||
}
|
||||
// Calculate the event length
|
||||
int maxLength = 0;
|
||||
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
|
||||
final int length = a.length() - annotatedCall.getReference().length();
|
||||
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
|
||||
}
|
||||
|
||||
myAttributes.put("NVH", maxNumVar);
|
||||
myAttributes.put("NumHapEval", bestHaplotypes.size());
|
||||
myAttributes.put("NumHapAssembly", haplotypes.size());
|
||||
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
|
||||
myAttributes.put("EVENTLENGTH", maxLength);
|
||||
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
|
||||
myAttributes.put("extType", annotatedCall.getType().toString() );
|
||||
|
||||
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
|
||||
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
|
||||
//}
|
||||
if( annotatedCall.hasAttribute("QD") ) {
|
||||
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
|
||||
}
|
||||
|
||||
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
|
||||
|
||||
return 1; // One active region was processed during this map call
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer cur, Integer sum) {
|
||||
return cur + sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
logger.info("Ran local assembly on " + result + " active regions");
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// private helper functions
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getExtendedLoc() + " with " + activeRegion.size() + " reads:"); }
|
||||
final ArrayList<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) );
|
||||
activeRegion.clearReads();
|
||||
|
||||
// Join overlapping paired reads to create a single longer read
|
||||
finalizedReadList.addAll( fragmentCollection.getSingletonReads() );
|
||||
for( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
|
||||
finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) );
|
||||
}
|
||||
|
||||
Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator());
|
||||
|
||||
// Loop through the reads hard clipping the adaptor and low quality tails
|
||||
for( final GATKSAMRecord myRead : finalizedReadList ) {
|
||||
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
|
||||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
// protect against INTERVALS with abnormally high coverage
|
||||
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
|
||||
activeRegion.add(clippedRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion ) {
|
||||
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>();
|
||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||
if( rec.getReadLength() < 24 || rec.getMappingQuality() <= 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
readsToRemove.add(rec);
|
||||
}
|
||||
}
|
||||
activeRegion.removeAll( readsToRemove );
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) {
|
||||
final int padLeft = Math.max(activeRegion.getReferenceLoc().getStart()-REFERENCE_PADDING, 1);
|
||||
final int padRight = Math.min(activeRegion.getReferenceLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReferenceLoc().getContig()).getSequenceLength());
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReferenceLoc().getContig(), padLeft, padRight);
|
||||
}
|
||||
|
||||
private HashMap<String, ArrayList<GATKSAMRecord>> splitReadsBySample( final List<GATKSAMRecord> reads ) {
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> returnMap = new HashMap<String, ArrayList<GATKSAMRecord>>();
|
||||
for( final String sample : samplesList) {
|
||||
ArrayList<GATKSAMRecord> readList = returnMap.get( sample );
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
returnMap.put(sample, readList);
|
||||
}
|
||||
}
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
returnMap.get(read.getReadGroup().getSample()).add(read);
|
||||
}
|
||||
|
||||
return returnMap;
|
||||
}
|
||||
|
||||
/*
|
||||
private int determinePruneFactorFromCoverage( final ActiveRegion activeRegion ) {
|
||||
final ArrayList<Integer> readLengthDistribution = new ArrayList<Integer>();
|
||||
for( final GATKSAMRecord read : activeRegion.getReads() ) {
|
||||
readLengthDistribution.add(read.getReadLength());
|
||||
}
|
||||
final double meanReadLength = MathUtils.average(readLengthDistribution);
|
||||
final double meanCoveragePerSample = (double) activeRegion.getReads().size() / ((double) activeRegion.getExtendedLoc().size() / meanReadLength) / (double) samplesList.size();
|
||||
int PRUNE_FACTOR = 0;
|
||||
if( meanCoveragePerSample > 8.5 ) {
|
||||
PRUNE_FACTOR = (int) Math.floor( Math.sqrt( meanCoveragePerSample - 5.0 ) );
|
||||
} else if( meanCoveragePerSample > 3.0 ) {
|
||||
PRUNE_FACTOR = 1;
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println(String.format("Mean coverage per sample = %.1f --> prune factor = %d", meanCoveragePerSample, PRUNE_FACTOR)); }
|
||||
return PRUNE_FACTOR;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
|
@ -0,0 +1,441 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Haplotype-based resolution of variants in 2 different eval files.
|
||||
*
|
||||
* <p>
|
||||
* HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them.
|
||||
* From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants.
|
||||
* Records are annotated with the set and status attributes.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* 2 variant files to resolve.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A single consensus VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx1g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T HaplotypeResolver \
|
||||
* -V:v1 input1.vcf \
|
||||
* -V:v2 input2.vcf \
|
||||
* -o output.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW))
|
||||
public class HaplotypeResolver extends RodWalker<Integer, Integer> {
|
||||
|
||||
protected static final String INTERSECTION_SET = "intersection";
|
||||
protected static final String SAME_STATUS = "same";
|
||||
protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch";
|
||||
protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles";
|
||||
protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype";
|
||||
protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther";
|
||||
protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents";
|
||||
|
||||
protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50;
|
||||
protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000;
|
||||
protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100;
|
||||
protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER;
|
||||
|
||||
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
|
||||
public List<RodBinding<VariantContext>> variants;
|
||||
|
||||
@Output(doc="File to which variants should be written", required=true)
|
||||
protected VariantContextWriter baseWriter = null;
|
||||
private VariantContextWriter writer;
|
||||
|
||||
/**
|
||||
* Set to 'null' if you don't want the set field emitted.
|
||||
*/
|
||||
@Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
|
||||
protected String SET_KEY = "set";
|
||||
|
||||
/**
|
||||
* Set to 'null' if you don't want the status field emitted.
|
||||
*/
|
||||
@Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false)
|
||||
protected String STATUS_KEY = "status";
|
||||
|
||||
private final LinkedList<VCcontext> queue = new LinkedList<VCcontext>();
|
||||
private String source1, source2;
|
||||
private final List<VariantContext> sourceVCs1 = new ArrayList<VariantContext>();
|
||||
private final List<VariantContext> sourceVCs2 = new ArrayList<VariantContext>();
|
||||
|
||||
|
||||
private class VCcontext {
|
||||
public final Collection<VariantContext> vcs;
|
||||
public final GenomeLoc loc;
|
||||
public final ReferenceContext ref;
|
||||
|
||||
public VCcontext(final Collection<VariantContext> vcs, final ReferenceContext ref) {
|
||||
this.vcs = vcs;
|
||||
this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next());
|
||||
this.ref = ref;
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize() {
|
||||
|
||||
if ( variants.size() != 2 ) {
|
||||
throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files");
|
||||
}
|
||||
source1 = variants.get(0).getName();
|
||||
source2 = variants.get(1).getName();
|
||||
|
||||
if ( SET_KEY.toLowerCase().equals("null") )
|
||||
SET_KEY = null;
|
||||
if ( STATUS_KEY.toLowerCase().equals("null") )
|
||||
STATUS_KEY = null;
|
||||
|
||||
// for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header)
|
||||
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
|
||||
if ( SET_KEY != null )
|
||||
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record"));
|
||||
if ( STATUS_KEY != null )
|
||||
headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match"));
|
||||
final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.<String>emptySet());
|
||||
baseWriter.writeHeader(vcfHeader);
|
||||
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
final Collection<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
|
||||
if ( VCs.size() == 0 )
|
||||
return 0;
|
||||
|
||||
final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref);
|
||||
|
||||
// TODO -- what should we do about filtered records?
|
||||
|
||||
if ( !queue.isEmpty() ) {
|
||||
|
||||
final VCcontext previous = queue.getLast();
|
||||
if ( !previous.loc.onSameContig(vc.loc) ||
|
||||
previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS ||
|
||||
queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) {
|
||||
purgeQueue();
|
||||
}
|
||||
}
|
||||
|
||||
queue.addLast(vc);
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
if ( !queue.isEmpty() )
|
||||
purgeQueue();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void purgeQueue() {
|
||||
|
||||
final ReferenceContext refContext = queue.getFirst().ref;
|
||||
|
||||
// divide them up by source
|
||||
while ( !queue.isEmpty() ) {
|
||||
VCcontext context = queue.removeFirst();
|
||||
for ( final VariantContext vc: context.vcs ) {
|
||||
if ( vc.getSource().equals(source1) )
|
||||
sourceVCs1.add(vc);
|
||||
else
|
||||
sourceVCs2.add(vc);
|
||||
}
|
||||
}
|
||||
|
||||
writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS);
|
||||
|
||||
if ( sourceVCs1.isEmpty() ) {
|
||||
writeAll(sourceVCs2, source2, null);
|
||||
} else if ( sourceVCs2.isEmpty() ) {
|
||||
writeAll(sourceVCs1, source1, null);
|
||||
} else {
|
||||
resolveByHaplotype(refContext);
|
||||
}
|
||||
|
||||
// allow for GC of the data
|
||||
sourceVCs1.clear();
|
||||
sourceVCs2.clear();
|
||||
}
|
||||
|
||||
private void writeAll(final List<VariantContext> sourceVCs, final String set, final String status) {
|
||||
for ( final VariantContext vc : sourceVCs ) {
|
||||
writeOne(vc, set, status);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeOne(final VariantContext vc, final String set, final String status) {
|
||||
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
if ( SET_KEY != null && set != null )
|
||||
attrs.put(SET_KEY, set);
|
||||
if ( STATUS_KEY != null && status != null )
|
||||
attrs.put(STATUS_KEY, status);
|
||||
writer.add(new VariantContextBuilder(vc).attributes(attrs).make());
|
||||
}
|
||||
|
||||
private void writeAndPurgeAllEqualVariants(final List<VariantContext> sourceVCs1, final List<VariantContext> sourceVCs2, final String status) {
|
||||
|
||||
int currentIndex1 = 0, currentIndex2 = 0;
|
||||
int size1 = sourceVCs1.size(), size2 = sourceVCs2.size();
|
||||
VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
|
||||
VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
|
||||
|
||||
while ( current1 != null && current2 != null ) {
|
||||
|
||||
final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
|
||||
final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);
|
||||
|
||||
if ( loc1.equals(loc2) ||
|
||||
(loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) {
|
||||
// test the alleles
|
||||
if ( determineAndWriteOverlap(current1, current2, status) ) {
|
||||
sourceVCs1.remove(currentIndex1);
|
||||
sourceVCs2.remove(currentIndex2);
|
||||
size1--;
|
||||
size2--;
|
||||
} else {
|
||||
currentIndex1++;
|
||||
currentIndex2++;
|
||||
}
|
||||
current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
|
||||
current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
|
||||
} else if ( loc1.isBefore(loc2) ) {
|
||||
currentIndex1++;
|
||||
current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
|
||||
} else {
|
||||
currentIndex2++;
|
||||
current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) {
|
||||
final int allelesFrom1In2 = findOverlap(vc1, vc2);
|
||||
final int allelesFrom2In1 = findOverlap(vc2, vc1);
|
||||
final int totalAllelesIn1 = vc1.getAlternateAlleles().size();
|
||||
final int totalAllelesIn2 = vc2.getAlternateAlleles().size();
|
||||
|
||||
final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1;
|
||||
final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2;
|
||||
|
||||
boolean thereIsOverlap = true;
|
||||
|
||||
if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) {
|
||||
writeOne(vc1, INTERSECTION_SET, status);
|
||||
} else if ( allAllelesFrom1Overlap ) {
|
||||
writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2);
|
||||
} else if ( allAllelesFrom2Overlap ) {
|
||||
writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1);
|
||||
} else if ( allelesFrom1In2 > 0 ) {
|
||||
writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS);
|
||||
} else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction
|
||||
writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS);
|
||||
} else {
|
||||
thereIsOverlap = false;
|
||||
}
|
||||
|
||||
return thereIsOverlap;
|
||||
}
|
||||
|
||||
private static int findOverlap(final VariantContext target, final VariantContext comparison) {
|
||||
int overlap = 0;
|
||||
for ( final Allele allele : target.getAlternateAlleles() ) {
|
||||
if ( comparison.hasAlternateAllele(allele) )
|
||||
overlap++;
|
||||
}
|
||||
return overlap;
|
||||
}
|
||||
|
||||
private static final double SW_MATCH = 4.0;
|
||||
private static final double SW_MISMATCH = -10.0;
|
||||
private static final double SW_GAP = -25.0;
|
||||
private static final double SW_GAP_EXTEND = -1.3;
|
||||
private void resolveByHaplotype(final ReferenceContext refContext) {
|
||||
|
||||
final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext);
|
||||
final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext);
|
||||
|
||||
final SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
|
||||
// protect against SW failures
|
||||
if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 ||
|
||||
swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) {
|
||||
// TODO -- handle errors appropriately
|
||||
logger.debug("Bad SW alignment; aborting at " + refContext.getLocus());
|
||||
return;
|
||||
}
|
||||
|
||||
// order results by start position
|
||||
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1, 0));
|
||||
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2, 0));
|
||||
if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
|
||||
// TODO -- handle errors appropriately
|
||||
logger.debug("No source alleles; aborting at " + refContext.getLocus());
|
||||
return;
|
||||
}
|
||||
|
||||
// create lists and test for equality
|
||||
final List<VariantContext> source1Alleles = new ArrayList<VariantContext>(source1Map.values());
|
||||
final List<VariantContext> source2Alleles = new ArrayList<VariantContext>(source2Map.values());
|
||||
|
||||
writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS);
|
||||
if ( source1Alleles.isEmpty() ) {
|
||||
writeAll(source2Alleles, source2, null);
|
||||
} else if ( source2Alleles.isEmpty() ) {
|
||||
writeAll(source1Alleles, source1, null);
|
||||
} else {
|
||||
writeDifferences(source1Alleles, source2Alleles);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] generateHaplotype(final List<VariantContext> sourceVCs, final ReferenceContext refContext) {
|
||||
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
final int startPos = refContext.getWindow().getStart();
|
||||
int currentPos = startPos;
|
||||
final byte[] reference = refContext.getBases();
|
||||
|
||||
for ( final VariantContext vc : sourceVCs ) {
|
||||
// add any missing reference context
|
||||
int vcStart = vc.getStart();
|
||||
final int refAlleleLength = vc.getReference().length();
|
||||
if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position)
|
||||
vcStart++;
|
||||
|
||||
while ( currentPos < vcStart )
|
||||
sb.append((char)reference[currentPos++ - startPos]);
|
||||
|
||||
// add the alt allele
|
||||
sb.append(vc.getAlternateAllele(0).getBaseString());
|
||||
|
||||
// skip the reference allele
|
||||
currentPos += refAlleleLength;
|
||||
}
|
||||
// add any missing reference context
|
||||
final int stopPos = refContext.getWindow().getStop();
|
||||
while ( currentPos < stopPos )
|
||||
sb.append((char)reference[currentPos++ - startPos]);
|
||||
|
||||
return sb.toString().getBytes();
|
||||
}
|
||||
|
||||
private void writeDifferences(final List<VariantContext> source1Alleles, final List<VariantContext> source2Alleles) {
|
||||
int currentIndex1 = 0, currentIndex2 = 0;
|
||||
final int size1 = source1Alleles.size(), size2 = source2Alleles.size();
|
||||
VariantContext current1 = source1Alleles.get(0);
|
||||
VariantContext current2 = source2Alleles.get(0);
|
||||
|
||||
while ( currentIndex1 < size1 || currentIndex2 < size2 ) {
|
||||
if ( current1 == null ) {
|
||||
writeOne(current2, source2, null);
|
||||
currentIndex2++;
|
||||
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
|
||||
} else if ( current2 == null ) {
|
||||
writeOne(current1, source1, null);
|
||||
currentIndex1++;
|
||||
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
|
||||
} else {
|
||||
|
||||
final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
|
||||
final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);
|
||||
|
||||
if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) {
|
||||
String status;
|
||||
if ( loc1.getStart() == loc2.getStart() ) {
|
||||
final String allele1 = current1.getAlternateAllele(0).getBaseString();
|
||||
final String allele2 = current2.getAlternateAllele(0).getBaseString();
|
||||
if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 )
|
||||
status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS;
|
||||
else
|
||||
status = SAME_START_DIFFERENT_ALLELES_STATUS;
|
||||
} else {
|
||||
status = OVERLAPPING_EVENTS_STATUS;
|
||||
}
|
||||
|
||||
writeOne(current1, INTERSECTION_SET, status);
|
||||
currentIndex1++;
|
||||
currentIndex2++;
|
||||
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
|
||||
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
|
||||
} else if ( loc1.isBefore(loc2) ) {
|
||||
writeOne(current1, source1, null);
|
||||
currentIndex1++;
|
||||
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
|
||||
} else {
|
||||
writeOne(current2, source2, null);
|
||||
currentIndex2++;
|
||||
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,149 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
|
||||
// This is different from most graph traversals because we want to test paths from any source node to any sink node.
|
||||
public class KBestPaths {
|
||||
|
||||
// static access only
|
||||
protected KBestPaths() { }
|
||||
private static int MAX_PATHS_TO_HOLD = 100;
|
||||
|
||||
protected static class MyInt { public int val = 0; }
|
||||
|
||||
// class to keep track of paths
|
||||
protected static class Path {
|
||||
|
||||
// the last vertex seen in the path
|
||||
private DeBruijnVertex lastVertex;
|
||||
|
||||
// the list of edges comprising the path
|
||||
private ArrayList<DeBruijnEdge> edges;
|
||||
|
||||
// the scores for the path
|
||||
private int totalScore = 0, lowestEdge = -1;
|
||||
|
||||
public Path( final DeBruijnVertex initialVertex ) {
|
||||
lastVertex = initialVertex;
|
||||
edges = new ArrayList<DeBruijnEdge>(0);
|
||||
}
|
||||
|
||||
public Path( final Path p, final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
|
||||
lastVertex = graph.getEdgeTarget(edge);
|
||||
edges = new ArrayList<DeBruijnEdge>(p.edges);
|
||||
edges.add(edge);
|
||||
totalScore = p.totalScore + edge.getMultiplicity();
|
||||
lowestEdge = ( p.lowestEdge == -1 ) ? edge.getMultiplicity() : Math.min(p.lowestEdge, edge.getMultiplicity());
|
||||
}
|
||||
|
||||
public boolean containsEdge( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
|
||||
final DeBruijnVertex targetVertex = graph.getEdgeTarget(edge);
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
if( e.equals(graph, edge) || graph.getEdgeTarget(e).equals(targetVertex) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public ArrayList<DeBruijnEdge> getEdges() { return edges; }
|
||||
|
||||
public int getScore() { return totalScore; }
|
||||
|
||||
public int getLowestEdge() { return lowestEdge; }
|
||||
|
||||
public DeBruijnVertex getLastVertexInPath() { return lastVertex; }
|
||||
|
||||
public byte[] getBases( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
|
||||
if( edges.size() == 0 ) { return lastVertex.getSequence(); }
|
||||
|
||||
byte[] bases = graph.getEdgeSource( edges.get(0) ).getSequence();
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
bases = ArrayUtils.addAll(bases, graph.getEdgeTarget( e ).getSuffix());
|
||||
}
|
||||
return bases;
|
||||
}
|
||||
}
|
||||
|
||||
protected static class PathComparatorTotalScore implements Comparator<Path> {
|
||||
public int compare(final Path path1, final Path path2) {
|
||||
return path1.totalScore - path2.totalScore;
|
||||
}
|
||||
}
|
||||
|
||||
protected static class PathComparatorLowestEdge implements Comparator<Path> {
|
||||
public int compare(final Path path1, final Path path2) {
|
||||
return path2.lowestEdge - path1.lowestEdge;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<Path> getKBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final int k ) {
|
||||
if( k > MAX_PATHS_TO_HOLD/2 ) { throw new ReviewedStingException("Asked for more paths than MAX_PATHS_TO_HOLD!"); }
|
||||
final ArrayList<Path> bestPaths = new ArrayList<Path>();
|
||||
|
||||
// run a DFS for best paths
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 ) {
|
||||
findBestPaths(graph, new Path(v), bestPaths);
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort(bestPaths, new PathComparatorLowestEdge() );
|
||||
Collections.reverse(bestPaths);
|
||||
return bestPaths.subList(0, Math.min(k, bestPaths.size()));
|
||||
}
|
||||
|
||||
private static void findBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path, final List<Path> bestPaths ) {
|
||||
findBestPaths(graph, path, bestPaths, new MyInt());
|
||||
}
|
||||
|
||||
private static void findBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path, final List<Path> bestPaths, MyInt n ) {
|
||||
|
||||
// did we hit the end of a path?
|
||||
if ( allOutgoingEdgesHaveBeenVisited(graph, path) ) {
|
||||
if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
|
||||
// clean out some low scoring paths
|
||||
Collections.sort(bestPaths, new PathComparatorLowestEdge() );
|
||||
for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); }
|
||||
}
|
||||
bestPaths.add(path);
|
||||
} else if( n.val > 10000) {
|
||||
// do nothing, just return
|
||||
} else {
|
||||
// recursively run DFS
|
||||
final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>();
|
||||
edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex));
|
||||
Collections.sort(edgeArrayList);
|
||||
Collections.reverse(edgeArrayList);
|
||||
for ( final DeBruijnEdge edge : edgeArrayList ) {
|
||||
// make sure the edge is not already in the path
|
||||
if ( path.containsEdge(graph, edge) )
|
||||
continue;
|
||||
|
||||
final Path newPath = new Path(path, graph, edge);
|
||||
n.val++;
|
||||
findBestPaths(graph, newPath, bestPaths, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean allOutgoingEdgesHaveBeenVisited( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path ) {
|
||||
for( final DeBruijnEdge edge : graph.outgoingEdgesOf(path.lastVertex) ) {
|
||||
if( !path.containsEdge(graph, edge) ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,403 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class LikelihoodCalculationEngine {
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
|
||||
private final byte constantGCP;
|
||||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
|
||||
pairHMM = new PairHMM( noBanded );
|
||||
this.constantGCP = constantGCP;
|
||||
DEBUG = debug;
|
||||
}
|
||||
|
||||
public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
|
||||
int X_METRIC_LENGTH = 0;
|
||||
for( final String sample : perSampleReadList.keySet() ) {
|
||||
for( final GATKSAMRecord read : perSampleReadList.get(sample) ) {
|
||||
final int readLength = read.getReadLength();
|
||||
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
|
||||
}
|
||||
}
|
||||
int Y_METRIC_LENGTH = 0;
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final int haplotypeLength = haplotypes.get(jjj).getBases().length;
|
||||
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
|
||||
}
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
X_METRIC_LENGTH += 2;
|
||||
Y_METRIC_LENGTH += 2;
|
||||
|
||||
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
|
||||
// for each sample's reads
|
||||
for( final String sample : perSampleReadList.keySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray );
|
||||
}
|
||||
}
|
||||
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final int numReads = reads.size();
|
||||
final double[][] readLikelihoods = new double[numHaplotypes][numReads];
|
||||
for( int iii = 0; iii < numReads; iii++ ) {
|
||||
final GATKSAMRecord read = reads.get(iii);
|
||||
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
Haplotype previousHaplotypeSeen = null;
|
||||
final byte[] readQuals = read.getBaseQualities();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities();
|
||||
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
|
||||
readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
|
||||
readQuals[kkk] = ( readQuals[kkk] < (byte) 17 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
|
||||
}
|
||||
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final Haplotype haplotype = haplotypes.get(jjj);
|
||||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP,
|
||||
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] );
|
||||
}
|
||||
}
|
||||
|
||||
private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
|
||||
for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){
|
||||
if( b1[iii] != b2[iii] ) {
|
||||
return iii;
|
||||
}
|
||||
}
|
||||
return b1.length;
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final String sample ) {
|
||||
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping );
|
||||
}
|
||||
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
|
||||
|
||||
final int numHaplotypes = haplotypeMapping.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
|
||||
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
|
||||
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final Set<String> samples ) {
|
||||
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
|
||||
final int numHaplotypes = haplotypeMapping.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
|
||||
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"})
|
||||
protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) {
|
||||
final int numHaplotypes = likelihoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
|
||||
int index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj];
|
||||
}
|
||||
}
|
||||
genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
|
||||
index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++];
|
||||
}
|
||||
}
|
||||
return likelihoodMatrix;
|
||||
}
|
||||
|
||||
/*
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
|
||||
|
||||
// BUGBUG: This function needs a lot of work. Need to use 4-gamete test or Tajima's D to decide to break up events into separate pieces for genotyping
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
|
||||
final double[][][] haplotypeLikelihoodMatrix = new double[sampleKeySet.size()][numHaplotypes][numHaplotypes];
|
||||
|
||||
int sampleCount = 0;
|
||||
for( final String sample : sampleKeySet ) {
|
||||
haplotypeLikelihoodMatrix[sampleCount++] = computeDiploidHaplotypeLikelihoods( haplotypes, sample );
|
||||
}
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
int chosenSample = 0;
|
||||
//double bestElement = Double.NEGATIVE_INFINITY;
|
||||
final int maxChosenHaplotypes = Math.min( 15, sampleKeySet.size() * 2 + 1 );
|
||||
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( int kkk = 0; kkk < sampleCount; kkk++ ) {
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
if( haplotypeLikelihoodMatrix[kkk][iii][jjj] > maxElement ) {
|
||||
maxElement = haplotypeLikelihoodMatrix[kkk][iii][jjj];
|
||||
hap1 = iii;
|
||||
hap2 = jjj;
|
||||
chosenSample = kkk;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
|
||||
|
||||
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
|
||||
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
|
||||
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
haplotypeLikelihoodMatrix[chosenSample][iii][jjj] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
|
||||
final ArrayList<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
|
||||
for( final int hIndex : bestHaplotypesIndexList ) {
|
||||
bestHaplotypes.add( haplotypes.get(hIndex) );
|
||||
}
|
||||
return bestHaplotypes;
|
||||
}
|
||||
*/
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( haplotypes, sampleKeySet ); // all samples pooled together
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
//double bestElement = Double.NEGATIVE_INFINITY;
|
||||
final int maxChosenHaplotypes = Math.min( 8, sampleKeySet.size() * 2 + 1 );
|
||||
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
|
||||
maxElement = haplotypeLikelihoodMatrix[iii][jjj];
|
||||
hap1 = iii;
|
||||
hap2 = jjj;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
|
||||
if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
|
||||
haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
|
||||
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
|
||||
final ArrayList<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
|
||||
for( final int hIndex : bestHaplotypesIndexList ) {
|
||||
bestHaplotypes.add( haplotypes.get(hIndex) );
|
||||
}
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static Map<String, Map<Allele, List<GATKSAMRecord>>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
final Map<String, Map<Allele, List<GATKSAMRecord>>> returnMap = new HashMap<String, Map<Allele, List<GATKSAMRecord>>>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final String sample : perSampleReadList.keySet() ) {
|
||||
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
|
||||
final ArrayList<GATKSAMRecord> readsForThisSample = perSampleReadList.get(sample);
|
||||
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
|
||||
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
final double likelihoods[] = new double[call.getFirst().getAlleles().size()];
|
||||
int count = 0;
|
||||
for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood
|
||||
double maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
|
||||
final double likelihood = h.getReadLikelihoods(sample)[iii];
|
||||
if( likelihood > maxLikelihood ) {
|
||||
maxLikelihood = likelihood;
|
||||
}
|
||||
}
|
||||
likelihoods[count++] = maxLikelihood;
|
||||
}
|
||||
final int bestAllele = MathUtils.maxElementIndex(likelihoods);
|
||||
final double bestLikelihood = likelihoods[bestAllele];
|
||||
Allele allele = Allele.NO_CALL;
|
||||
boolean isInformativeRead = false;
|
||||
for( final double likelihood : likelihoods ) {
|
||||
if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) {
|
||||
isInformativeRead = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// uninformative reads get the no call Allele
|
||||
if( isInformativeRead ) {
|
||||
allele = call.getFirst().getAlleles().get(bestAllele);
|
||||
}
|
||||
List<GATKSAMRecord> readList = alleleReadMap.get(allele);
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
alleleReadMap.put(allele, readList);
|
||||
}
|
||||
readList.add(read);
|
||||
}
|
||||
}
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
List<GATKSAMRecord> readList = alleleReadMap.get(Allele.NO_CALL);
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
alleleReadMap.put(Allele.NO_CALL, readList);
|
||||
}
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
readList.add(read);
|
||||
}
|
||||
}
|
||||
returnMap.put(sample, alleleReadMap);
|
||||
}
|
||||
return returnMap;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 14, 2011
|
||||
*/
|
||||
public abstract class LocalAssemblyEngine {
|
||||
|
||||
public enum ASSEMBLER {
|
||||
SIMPLE_DE_BRUIJN
|
||||
}
|
||||
|
||||
protected LocalAssemblyEngine() {
|
||||
}
|
||||
|
||||
public abstract ArrayList<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, ArrayList<VariantContext> activeAllelesToGenotype);
|
||||
}
|
||||
|
|
@ -0,0 +1,372 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks, rpoplin
|
||||
* Date: Mar 14, 2011
|
||||
*/
|
||||
|
||||
public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
||||
|
||||
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
|
||||
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
|
||||
private static final byte MIN_QUALITY = (byte) 17;
|
||||
|
||||
// Smith-Waterman parameters originally copied from IndelRealigner
|
||||
private static final double SW_MATCH = 5.0; // 1.0;
|
||||
private static final double SW_MISMATCH = -10.0; //-1.0/3.0;
|
||||
private static final double SW_GAP = -22.0; //-1.0-1.0/3.0;
|
||||
private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final PrintStream GRAPH_WRITER;
|
||||
private final ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>> graphs = new ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>>();
|
||||
|
||||
private int PRUNE_FACTOR = 1;
|
||||
|
||||
public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter ) {
|
||||
super();
|
||||
DEBUG = debug;
|
||||
GRAPH_WRITER = graphWriter;
|
||||
}
|
||||
|
||||
public ArrayList<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList<VariantContext> activeAllelesToGenotype ) {
|
||||
this.PRUNE_FACTOR = PRUNE_FACTOR;
|
||||
|
||||
// create the graphs
|
||||
createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
|
||||
|
||||
// clean up the graphs by pruning and merging
|
||||
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
|
||||
pruneGraph( graph, PRUNE_FACTOR );
|
||||
//eliminateNonRefPaths( graph );
|
||||
mergeNodes( graph );
|
||||
}
|
||||
|
||||
if( GRAPH_WRITER != null ) {
|
||||
printGraphs();
|
||||
}
|
||||
|
||||
// find the best paths in the graphs
|
||||
return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
|
||||
}
|
||||
|
||||
private void createDeBruijnGraphs( final ArrayList<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
|
||||
graphs.clear();
|
||||
|
||||
// create the graph
|
||||
for( int kmer = 31; kmer <= 75; kmer += 6 ) {
|
||||
final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) {
|
||||
graphs.add(graph);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static void mergeNodes( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
for( final DeBruijnEdge e : graph.edgeSet() ) {
|
||||
final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e);
|
||||
final DeBruijnVertex incomingVertex = graph.getEdgeSource(e);
|
||||
if( !outgoingVertex.equals(incomingVertex) && graph.inDegreeOf(outgoingVertex) == 1 && graph.outDegreeOf(incomingVertex) == 1) {
|
||||
final Set<DeBruijnEdge> outEdges = graph.outgoingEdgesOf(outgoingVertex);
|
||||
final Set<DeBruijnEdge> inEdges = graph.incomingEdgesOf(incomingVertex);
|
||||
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
} else if( inEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
} else if( outEdges.size() == 1 ) {
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
}
|
||||
|
||||
final DeBruijnVertex addedVertex = new DeBruijnVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer );
|
||||
graph.addVertex(addedVertex);
|
||||
for( final DeBruijnEdge edge : outEdges ) {
|
||||
graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity()));
|
||||
}
|
||||
for( final DeBruijnEdge edge : inEdges ) {
|
||||
graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity()));
|
||||
}
|
||||
|
||||
graph.removeVertex( incomingVertex );
|
||||
graph.removeVertex( outgoingVertex );
|
||||
foundNodesToMerge = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static void pruneGraph( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final int pruneFactor ) {
|
||||
final ArrayList<DeBruijnEdge> edgesToRemove = new ArrayList<DeBruijnEdge>();
|
||||
for( final DeBruijnEdge e : graph.edgeSet() ) {
|
||||
if( e.getMultiplicity() <= pruneFactor && !e.getIsRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
|
||||
edgesToRemove.add(e);
|
||||
}
|
||||
}
|
||||
graph.removeAllEdges(edgesToRemove);
|
||||
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final ArrayList<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
graph.removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
protected static void eliminateNonRefPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
|
||||
final ArrayList<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
|
||||
boolean done = false;
|
||||
while( !done ) {
|
||||
done = true;
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 || graph.outDegreeOf(v) == 0 ) {
|
||||
boolean isRefNode = false;
|
||||
for( final DeBruijnEdge e : graph.edgesOf(v) ) {
|
||||
if( e.getIsRef() ) {
|
||||
isRefNode = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !isRefNode ) {
|
||||
done = false;
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
graph.removeAllVertices(verticesToRemove);
|
||||
verticesToRemove.clear();
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final ArrayList<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
|
||||
final byte[] refSequence = refHaplotype.getBases();
|
||||
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
|
||||
for (int i = 0; i < kmersInSequence - 1; i++) {
|
||||
// get the kmers
|
||||
final byte[] kmer1 = new byte[KMER_LENGTH];
|
||||
System.arraycopy(refSequence, i, kmer1, 0, KMER_LENGTH);
|
||||
final byte[] kmer2 = new byte[KMER_LENGTH];
|
||||
System.arraycopy(refSequence, i+1, kmer2, 0, KMER_LENGTH);
|
||||
if( !addKmersToGraph(graph, kmer1, kmer2, true) ) {
|
||||
if( DEBUG ) {
|
||||
System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
final byte[] qualities = read.getBaseQualities();
|
||||
if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
// if the qualities of all the bases in the kmers are high enough
|
||||
boolean badKmer = false;
|
||||
for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) {
|
||||
if( qualities[jjj] < MIN_QUALITY ) {
|
||||
badKmer = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !badKmer ) {
|
||||
// get the kmers
|
||||
final byte[] kmer1 = new byte[KMER_LENGTH];
|
||||
System.arraycopy(sequence, iii, kmer1, 0, KMER_LENGTH);
|
||||
final byte[] kmer2 = new byte[KMER_LENGTH];
|
||||
System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH);
|
||||
|
||||
addKmersToGraph(graph, kmer1, kmer2, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
protected static boolean addKmersToGraph( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final byte[] kmer1, final byte[] kmer2, final boolean isRef ) {
|
||||
|
||||
final int numVertexBefore = graph.vertexSet().size();
|
||||
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length );
|
||||
graph.addVertex(v1);
|
||||
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length );
|
||||
graph.addVertex(v2);
|
||||
if( isRef && graph.vertexSet().size() == numVertexBefore ) { return false; }
|
||||
|
||||
final DeBruijnEdge targetEdge = graph.getEdge(v1, v2);
|
||||
if ( targetEdge == null ) {
|
||||
graph.addEdge(v1, v2, new DeBruijnEdge( isRef ));
|
||||
} else {
|
||||
if( isRef ) {
|
||||
targetEdge.setIsRef( true );
|
||||
}
|
||||
targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private void printGraphs() {
|
||||
int count = 0;
|
||||
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
|
||||
GRAPH_WRITER.println("digraph kmer" + count++ +" {");
|
||||
for( final DeBruijnEdge edge : graph.edgeSet() ) {
|
||||
if( edge.getMultiplicity() > PRUNE_FACTOR ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];");
|
||||
}
|
||||
if( edge.getIsRef() ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
|
||||
}
|
||||
if( !edge.getIsRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
|
||||
}
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
final String label = ( graph.inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() );
|
||||
GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]");
|
||||
}
|
||||
GRAPH_WRITER.println("}");
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
private ArrayList<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final ArrayList<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
|
||||
final ArrayList<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();
|
||||
|
||||
// add the reference haplotype separately from all the others
|
||||
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( fullReferenceWithPadding, refHaplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
refHaplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
|
||||
refHaplotype.setCigar( swConsensus.getCigar() );
|
||||
if( !returnHaplotypes.add( refHaplotype ) ) {
|
||||
throw new ReviewedStingException("Unable to add reference haplotype during assembly: " + refHaplotype);
|
||||
}
|
||||
|
||||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
|
||||
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart());
|
||||
if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
|
||||
return returnHaplotypes;
|
||||
//throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
|
||||
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
|
||||
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
|
||||
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
|
||||
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
|
||||
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
if( returnHaplotypes.size() > 1 ) {
|
||||
System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against.");
|
||||
} else {
|
||||
System.out.println("Found only the reference haplotype in the assembly graph.");
|
||||
}
|
||||
for( final Haplotype h : returnHaplotypes ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
}
|
||||
}
|
||||
|
||||
return returnHaplotypes;
|
||||
}
|
||||
|
||||
private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop ) {
|
||||
//final int sizeOfActiveRegion = activeRegionStop - activeRegionStart;
|
||||
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
|
||||
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0) );
|
||||
|
||||
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 ) { // protect against SW failures
|
||||
return false;
|
||||
}
|
||||
|
||||
final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true );
|
||||
int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
|
||||
if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
|
||||
hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
|
||||
}
|
||||
byte[] newHaplotypeBases;
|
||||
// extend partial haplotypes to contain the full active region sequence
|
||||
int leftBreakPoint = 0;
|
||||
int rightBreakPoint = 0;
|
||||
if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
|
||||
haplotype.getBases()),
|
||||
ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
|
||||
leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart;
|
||||
rightBreakPoint = leftBreakPoint + haplotype.getBases().length;
|
||||
//newHaplotypeBases = haplotype.getBases();
|
||||
//return false; // piece of haplotype isn't anchored within the active region so don't build a haplotype out of it
|
||||
} else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
//return false;
|
||||
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
|
||||
//newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), 0, hapStop);
|
||||
leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart;
|
||||
} else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
//return false;
|
||||
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
|
||||
//newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length);
|
||||
rightBreakPoint = haplotype.getBases().length - hapStart;
|
||||
} else {
|
||||
newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
|
||||
}
|
||||
|
||||
final Haplotype h = new Haplotype( newHaplotypeBases );
|
||||
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
|
||||
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
|
||||
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) );
|
||||
h.leftBreakPoint = leftBreakPoint;
|
||||
h.rightBreakPoint = rightBreakPoint;
|
||||
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures
|
||||
return false;
|
||||
}
|
||||
|
||||
if( !haplotypeList.contains(h) ) {
|
||||
haplotypeList.add(h);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue