New implementation of the GGA mode in the HaplotypeCaller

-- We now inject the given alleles into the reference haplotype and add them to the graph.
-- Those paths are read off of the graph and then evaluated with the appropriate marginalization for GGA mode.
-- This unifies how Smith-Waterman is performed between discovery and GGA modes.
-- Misc minor cleanup in several places.
This commit is contained in:
Ryan Poplin 2013-05-22 10:35:19 -04:00
parent 50b4c130ca
commit b5b9d745a7
16 changed files with 135 additions and 269 deletions

View File

@ -77,6 +77,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
private final static int NUM_PATHS_PER_GRAPH = 25;
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int GRAPH_KMER_STEP = 6;
private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
private final int minKmer;
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
@ -92,8 +93,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
}
@Override
protected List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
final List<SeqGraph> graphs = new LinkedList<SeqGraph>();
protected List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
final List<SeqGraph> graphs = new LinkedList<>();
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
if( maxKmer < minKmer) {
@ -106,7 +107,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
continue;
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype);
DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes);
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
@ -133,7 +134,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
}
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype ) {
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
@ -142,8 +143,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
// something went wrong, so abort right now with a null graph
return null;
// now go through the graph already seeded with the reference sequence and add the read kmers to it
if ( ! addReadKmersToGraph(builder, reads) )
// now go through the graph already seeded with the reference sequence and add the read kmers to it as well as the artificial GGA haplotypes
if ( ! addReadKmersToGraph(builder, reads, activeAlleleHaplotypes) )
// some problem was detected adding the reads to the graph, return null to indicate we failed
return null;
@ -156,11 +157,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
*
* @param builder a debruijn graph builder to add the read kmers to
* @param reads a non-null list of reads whose kmers we want to add to the graph
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
*/
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads, final List<Haplotype> activeAlleleHaplotypes) {
final int kmerLength = builder.getKmerSize();
// First pull kmers out of the artificial GGA haplotypes and throw them on the graph
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
final int end = haplotype.length() - kmerLength;
for( int start = 0; start < end; start++ ) {
builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS );
}
}
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases();

View File

@ -71,7 +71,7 @@ public class GenotypingEngine {
private final boolean DEBUG;
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
private final static List<Allele> noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
private final VariantAnnotatorEngine annotationEngine;
private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
@ -162,8 +162,8 @@ public class GenotypingEngine {
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype);
// Walk along each position in the key set and create each event to be outputted
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
final Set<Haplotype> calledHaplotypes = new HashSet<>();
final List<VariantContext> returnCalls = new ArrayList<>();
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
@ -183,7 +183,7 @@ public class GenotypingEngine {
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {
throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles.");
}
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<VariantContext, Allele>();
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) {
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
@ -244,7 +244,7 @@ public class GenotypingEngine {
if ( in_GGA_mode ) startPosKeySet.clear();
cleanUpSymbolicUnassembledEvents( haplotypes );
//cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently
if ( !in_GGA_mode ) {
// run the event merger if we're not in GGA mode
final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc);
@ -267,7 +267,7 @@ public class GenotypingEngine {
* @return the list of the sources of vcs in the same order
*/
private List<String> makePriorityList(final List<VariantContext> vcs) {
final List<String> priorityList = new LinkedList<String>();
final List<String> priorityList = new LinkedList<>();
for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource());
return priorityList;
}
@ -276,7 +276,7 @@ public class GenotypingEngine {
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
// the overlapping events to merge into a common reference view
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
final List<VariantContext> eventsAtThisLoc = new ArrayList<>();
if( activeAllelesToGenotype.isEmpty() ) {
for( final Haplotype h : haplotypes ) {
@ -292,7 +292,7 @@ public class GenotypingEngine {
if( compVC.getStart() == loc ) {
int alleleCount = 0;
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
List<Allele> alleleSet = new ArrayList<Allele>(2);
List<Allele> alleleSet = new ArrayList<>(2);
alleleSet.add(compVC.getReference());
alleleSet.add(compAltAllele);
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
@ -348,7 +348,7 @@ public class GenotypingEngine {
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final VariantContext call ) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<>();
final GenomeLoc callLoc = parser.createGenomeLoc(call);
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
@ -384,7 +384,7 @@ public class GenotypingEngine {
// TODO - split into input haplotypes and output haplotypes as not to share I/O arguments
@Requires("haplotypes != null")
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
final List<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
final List<Haplotype> haplotypesToRemove = new ArrayList<>();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
if( vc.isSymbolic() ) {
@ -407,7 +407,7 @@ public class GenotypingEngine {
final Map<Allele, List<Haplotype>> alleleMapper,
final double downsamplingFraction ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<>();
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
@ -430,7 +430,7 @@ public class GenotypingEngine {
}
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final Map<VariantContext, Allele> mergeMap, final Map<Event, List<Haplotype>> eventMap ) {
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<Allele, List<Haplotype>>();
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<>();
for( final Map.Entry<VariantContext, Allele> entry : mergeMap.entrySet() ) {
alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey())));
}
@ -441,100 +441,33 @@ public class GenotypingEngine {
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
protected static Map<Event, List<Haplotype>> createEventMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<Event, List<Haplotype>>(eventsAtThisLoc.size()+1);
VariantContext refVC = eventsAtThisLoc.get(0); // the genome loc is the only safe thing to pull out of this VC because ref/alt pairs might change reference basis
eventMapper.put(new Event(null), new ArrayList<Haplotype>());
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1);
final Event refEvent = new Event(null);
eventMapper.put(refEvent, new ArrayList<Haplotype>());
for( final VariantContext vc : eventsAtThisLoc ) {
eventMapper.put(new Event(vc), new ArrayList<Haplotype>());
}
final List<Haplotype> undeterminedHaplotypes = new ArrayList<Haplotype>(haplotypes.size());
for( final Haplotype h : haplotypes ) {
if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() ) {
final List<Allele> alleles = new ArrayList<Allele>(2);
alleles.add(h.getArtificialRefAllele());
alleles.add(h.getArtificialAltAllele());
final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype")
.alleles(alleles)
.loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() );
if( eventMapper.containsKey(artificialVC) ) {
eventMapper.get(artificialVC).add(h);
}
} else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later
undeterminedHaplotypes.add(h);
if( h.getEventMap().get(loc) == null ) {
eventMapper.get(refEvent).add(h);
} else {
boolean haplotypeIsDetermined = false;
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
eventMapper.get(new Event(vcAtThisLoc)).add(h);
haplotypeIsDetermined = true;
break;
}
}
if( !haplotypeIsDetermined )
undeterminedHaplotypes.add(h);
}
}
for( final Haplotype h : undeterminedHaplotypes ) {
Event matchingEvent = new Event(null);
for( final Map.Entry<Event, List<Haplotype>> eventToTest : eventMapper.entrySet() ) {
// don't test against the reference allele
if( eventToTest.getKey().equals(new Event(null)) )
continue;
// only try to disambiguate for alleles that have had haplotypes previously assigned above
if( eventToTest.getValue().isEmpty() )
continue;
final Haplotype artificialHaplotype = eventToTest.getValue().get(0);
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
matchingEvent = eventToTest.getKey();
break;
}
}
eventMapper.get(matchingEvent).add(h);
}
return eventMapper;
}
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset, final boolean resolveSupersetToSubset) {
for ( final Map.Entry<Integer, VariantContext> fromSubset : subset.entrySet() ) {
final VariantContext fromSuperset = superset.get(fromSubset.getKey());
if ( fromSuperset == null )
return false;
List<Allele> supersetAlleles = fromSuperset.getAlternateAlleles();
if ( resolveSupersetToSubset )
supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
return false;
}
return true;
}
private static List<Allele> resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List<Allele> currentAlleles) {
if ( targetReference.length() <= actualReference.length() )
return currentAlleles;
final List<Allele> newAlleles = new ArrayList<Allele>(currentAlleles.size());
final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
for ( final Allele a : currentAlleles ) {
newAlleles.add(Allele.extend(a, extraBases));
}
return newAlleles;
}
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final List<List<Haplotype>> alleleMapper, final List<Haplotype> haplotypes ) {
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
final List<Allele> eventAllelesForSample = new ArrayList<Allele>();
final List<Allele> eventAllelesForSample = new ArrayList<>();
for( final Allele a : haplotypeAllelesForSample ) {
final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a));
for( int iii = 0; iii < alleleMapper.size(); iii++ ) {

View File

@ -47,6 +47,9 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
@ -433,8 +436,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private final static int MIN_READ_LENGTH = 10;
private List<String> samplesList = new ArrayList<String>();
private final static double LOG_ONE_HALF = -Math.log10(2.0);
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
private final List<VariantContext> allelesToGenotype = new ArrayList<VariantContext>();
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
@ -603,7 +604,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// if we don't have any data, just abort early
return new ActivityProfileState(ref.getLocus(), 0.0);
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
final List<Allele> noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
noCall.add(Allele.NO_CALL);
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
@ -625,14 +626,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
}
genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
}
}
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final List<Allele> alleles = new ArrayList<Allele>();
final List<Allele> alleles = new ArrayList<>();
alleles.add( FAKE_REF_ALLELE );
alleles.add( FAKE_ALT_ALLELE );
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
@ -746,9 +747,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// Create the reference haplotype which is the bases from the reference that make up the active region
finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
@ -760,6 +761,21 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
}
/**
* Helper function to create the reference haplotype out of the active region and a padded loc
* @param activeRegion the active region from which to generate the reference haplotype
* @param paddedReferenceLoc the GenomeLoc which includes padding and shows how big the reference haplotype should be
* @return a non-null haplotype
*/
private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) {
final Haplotype refHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
refHaplotype.setAlignmentStartHapwrtRef(activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart());
final Cigar c = new Cigar();
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
refHaplotype.setCigar(c);
return refHaplotype;
}
/**
* Trim down the active region to just enough to properly genotype the events among the haplotypes
*
@ -791,7 +807,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
// trim down the haplotypes
final Set<Haplotype> haplotypeSet = new HashSet<Haplotype>(haplotypes.size());
final Set<Haplotype> haplotypeSet = new HashSet<>(haplotypes.size());
for ( final Haplotype h : haplotypes ) {
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
if ( trimmed != null ) {
@ -802,7 +818,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
// create the final list of trimmed haplotypes
final List<Haplotype> trimmedHaplotypes = new ArrayList<Haplotype>(haplotypeSet);
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(haplotypeSet);
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
@ -816,7 +832,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// trim down the reads and add them to the trimmed active region
final List<GATKSAMRecord> trimmedReads = new ArrayList<GATKSAMRecord>(originalActiveRegion.getReads().size());
final List<GATKSAMRecord> trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size());
for( final GATKSAMRecord read : originalActiveRegion.getReads() ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
@ -937,7 +953,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
private Map<String, List<GATKSAMRecord>> splitReadsBySample( final Collection<GATKSAMRecord> reads ) {
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<String, List<GATKSAMRecord>>();
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<>();
for( final String sample : samplesList) {
List<GATKSAMRecord> readList = returnMap.get( sample );
if( readList == null ) {

View File

@ -74,7 +74,6 @@ import java.util.*;
public class LikelihoodCalculationEngine {
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
private static final double LOG_ONE_HALF = -Math.log10(2.0);
private final byte constantGCP;
private final double log10globalReadMismappingRate;
private final boolean DEBUG;
@ -299,7 +298,7 @@ public class LikelihoodCalculationEngine {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF );
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
}
}
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
@ -397,11 +396,11 @@ public class LikelihoodCalculationEngine {
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
// all of the haplotypes that at least one sample called as one of the most likely
final Set<Haplotype> selectedHaplotypes = new HashSet<Haplotype>();
final Set<Haplotype> selectedHaplotypes = new HashSet<>();
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
// our annoying map from allele -> haplotype
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<Allele, Haplotype>();
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
for ( final Haplotype h : haplotypes ) {
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
allele2Haplotype.put(Allele.create(h, h.isReference()), h);

View File

@ -111,7 +111,11 @@ public abstract class LocalAssemblyEngine {
* @param refHaplotype the reference haplotype
* @return a non-null list of reads
*/
protected abstract List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype);
protected abstract List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype, List<Haplotype> activeAlleleHaplotypes);
protected List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype) {
return assemble(reads, refHaplotype, Collections.<Haplotype>emptyList());
}
/**
* Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
@ -128,8 +132,11 @@ public abstract class LocalAssemblyEngine {
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
// create the list of artificial haplotypes that should be added to the graph for GGA mode
final List<Haplotype> activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc());
// create the graphs by calling our subclass assemble method
final List<SeqGraph> graphs = assemble(activeRegion.getReads(), refHaplotype);
final List<SeqGraph> graphs = assemble(activeRegion.getReads(), refHaplotype, activeAlleleHaplotypes);
// do some QC on the graphs
for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); }
@ -138,45 +145,53 @@ public abstract class LocalAssemblyEngine {
if ( graphWriter != null ) { printGraphs(graphs); }
// find the best paths in the graphs and return them as haplotypes
return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
return findBestPaths( graphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
}
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
@Ensures({"result.contains(refHaplotype)"})
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<Haplotype>();
refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
final Cigar c = new Cigar();
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
refHaplotype.setCigar(c);
returnHaplotypes.add( refHaplotype );
/**
* Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype
* @param refHaplotype the reference haplotype
* @param activeAllelesToGenotype the list of alternate alleles in VariantContexts
* @param activeRegionWindow the window containing the reference haplotype
* @return a non-null list of haplotypes
*/
private List<Haplotype> createActiveAlleleHaplotypes(final Haplotype refHaplotype, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
// for GGA mode, add the desired allele into the haplotype
for( final VariantContext compVC : activeAllelesToGenotype ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype
returnHaplotypes.add(insertedRefHaplotype);
}
}
}
return new ArrayList<>(returnHaplotypes);
}
@Ensures({"result.contains(refHaplotype)"})
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow) {
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
returnHaplotypes.add( refHaplotype );
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
for( final SeqGraph graph : graphs ) {
final SeqVertex source = graph.getReferenceSourceVertex();
final SeqVertex sink = graph.getReferenceSinkVertex();
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>(allowCyclesInKmerGraphToGeneratePaths);
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(allowCyclesInKmerGraphToGeneratePaths);
for ( final Path<SeqVertex,BaseEdge> path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) {
// logger.info("Found path " + path);
Haplotype h = new Haplotype( path.getBases() );
if( !returnHaplotypes.contains(h) ) {
final Cigar cigar = path.calculateCigar(refHaplotype.getBases());
if ( cigar == null ) {
// couldn't produce a meaningful alignment of haplotype to reference, fail quitely
// couldn't produce a meaningful alignment of haplotype to reference, fail quietly
continue;
} else if( cigar.isEmpty() ) {
throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() +
@ -197,25 +212,6 @@ public abstract class LocalAssemblyEngine {
if ( debug )
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
// for GGA mode, add the desired allele into the haplotype if it isn't already present
if( !activeAllelesToGenotype.isEmpty() ) {
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
// This if statement used to additionally have:
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
// a haplotype that already contains a 1bp insertion (so practically it is reference but
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
if( vcOnHaplotype == null ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
}
}
}
}
}
}
}
@ -238,7 +234,7 @@ public abstract class LocalAssemblyEngine {
}
}
return new ArrayList<Haplotype>(returnHaplotypes);
return new ArrayList<>(returnHaplotypes);
}
/**
@ -256,71 +252,6 @@ public abstract class LocalAssemblyEngine {
return false;
}
/**
* Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
* Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
* This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
* @param haplotype the candidate haplotype
* @param ref the reference bases to align against
* @param haplotypeList the current list of haplotypes
* @param activeRegionStart the start of the active region in the reference byte array
* @param activeRegionStop the stop of the active region in the reference byte array
* @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
* @return true if the candidate haplotype was successfully incorporated into the haplotype list
*/
@Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final Set<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
if( haplotype == null ) { return false; }
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
return false;
}
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
}
byte[] newHaplotypeBases;
// extend partial haplotypes to contain the full active region sequence
if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
newHaplotypeBases = ArrayUtils.addAll(ArrayUtils.addAll(ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
haplotype.getBases()),
ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop));
} else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
} else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
} else {
newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
}
final Haplotype h = new Haplotype( newHaplotypeBases );
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
if ( haplotype.isArtificialHaplotype() ) {
h.setArtificialEvent(haplotype.getArtificialEvent());
}
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
return false;
}
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
haplotypeList.add(h);
return true;
} else {
return false;
}
}
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
@ -372,7 +303,6 @@ public abstract class LocalAssemblyEngine {
* Perform general QC on the graph to make sure something hasn't gone wrong during assembly
* @param graph the graph to check
* @param refHaplotype the reference haplotype
* @param <T>
*/
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
sanityCheckReferenceGraph(graph, refHaplotype);
@ -383,7 +313,6 @@ public abstract class LocalAssemblyEngine {
*
* @param graph the graph to check
* @param refHaplotype the reference haplotype
* @param <T>
*/
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckReferenceGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
if( graph.getReferenceSourceVertex() == null ) {

View File

@ -62,6 +62,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
private final static Logger logger = Logger.getLogger(ReadThreadingAssembler.class);
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
/** The min and max kmer sizes to try when building the graph. */
private final List<Integer> kmerSizes;
@ -88,7 +89,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
}
@Override
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
final List<SeqGraph> graphs = new LinkedList<>();
for ( final int kmerSize : kmerSizes ) {
@ -96,6 +97,12 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
// add the reference sequence to the graph
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
int hapCount = 0;
for( final Haplotype h : activeAlleleHaplotypes ) {
final int[] counts = new int[h.length()];
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
}
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {

View File

@ -590,11 +590,9 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
}
/**
* Get the start and stop positions (exclusive) of the longest stretch of high quality bases
* in read
* Get the longest stretch of high quality bases in read and pass that sequence to the graph
*
* @param read a non-null read
* @return the start and stop for high quality bases in read, or null if none exist
*/
protected void addRead(final GATKSAMRecord read) {
final byte[] sequence = read.getReadBases();

View File

@ -78,8 +78,6 @@ public class PairHMMIndelErrorModel {
private static final double baseMatchArray[];
private static final double baseMismatchArray[];
private final static double LOG_ONE_HALF;
private static final int START_HRUN_GAP_IDX = 4;
private static final int MAX_HRUN_GAP_IDX = 20;
@ -97,8 +95,6 @@ public class PairHMMIndelErrorModel {
/////////////////////////////
static {
LOG_ONE_HALF= -Math.log10(2.0);
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
@ -466,7 +462,7 @@ public class PairHMMIndelErrorModel {
final double li = readLikelihoods[readIdx][i];
final double lj = readLikelihoods[readIdx][j];
final int readCount = readCounts[readIdx];
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF);
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF);
}
}
}

View File

@ -73,8 +73,8 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
public void testReferenceCycleGraph() {
String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true));
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true));
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), Collections.<Haplotype>emptyList());
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), Collections.<Haplotype>emptyList());
Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
@ -147,7 +147,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
}
}
assembler.addReadKmersToGraph(builder, Arrays.asList(read));
assembler.addReadKmersToGraph(builder, Arrays.asList(read), Collections.<Haplotype>emptyList());
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
for ( final Kmer addedKmer : builder.addedPairs ) {
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);

View File

@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"008029ee34e1becd8312e3c4d608033c");
"38b4596c3910fdde51ea59aa1a8f848f");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"ae8d95ffe77515cc74a55c2afd142826");
"08147870d73d9749ced8cfc7cdd4714f");
}
}

View File

@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"bb30d0761dc9e2dfd57bfe07b72d06d8");
"ffd69c410dca0d2f9fe75f3cb5d08179");
}
@Test

View File

@ -47,6 +47,9 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
@ -216,6 +219,10 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
private List<Haplotype> assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
final Haplotype refHaplotype = new Haplotype(refBases, true);
final Cigar c = new Cigar();
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
refHaplotype.setCigar(c);
final ActiveRegion activeRegion = new ActiveRegion(loc, null, true, genomeLocParser, 0);
activeRegion.addAll(reads);
final LocalAssemblyEngine engine = createAssembler(assembler);

View File

@ -85,7 +85,7 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest {
public SeqGraph assemble() {
assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
assembler.setDebugGraphTransformations(true);
final SeqGraph graph = assembler.assemble(reads, refHaplotype).get(0);
final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0);
if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
return graph;
}

View File

@ -55,17 +55,19 @@ public class MathUtils {
private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP;
private static final double MAX_JACOBIAN_TOLERANCE = 8.0;
private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1;
private static final int MAXN = 70000;
private static final int MAXN = 70_000;
private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
/**
* The smallest log10 value we'll emit from normalizeFromLog10 and other functions
* where the real-space value is 0.0.
*/
public final static double LOG10_P_OF_ZERO = -1000000.0;
public final static double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5);
private final static double NATURAL_LOG_OF_TEN = Math.log(10.0);
private final static double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI);
public static final double LOG10_P_OF_ZERO = -1000000.0;
public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5);
public static final double LOG_ONE_HALF = -Math.log10(2.0);
public static final double LOG_ONE_THIRD = -Math.log10(3.0);
private static final double NATURAL_LOG_OF_TEN = Math.log(10.0);
private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI);
static {
log10Cache = new double[LOG10_CACHE_SIZE];

View File

@ -221,7 +221,7 @@ public class PerReadAlleleLikelihoodMap {
final int count = ReadUtils.getMeanRepresentativeReadCount(read);
final double likelihood_iii = entry.getValue().get(iii_allele);
final double likelihood_jjj = entry.getValue().get(jjj_allele);
haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF);
haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF);
// fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair
if ( haplotypeLikelihood < maxElement ) break;
@ -241,7 +241,6 @@ public class PerReadAlleleLikelihoodMap {
return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement);
}
private static final double LOG_ONE_HALF = -Math.log10(2.0);
/**
* Given a map from alleles to likelihoods, find the allele with the largest likelihood.

View File

@ -46,7 +46,6 @@ public class Haplotype extends Allele {
private EventMap eventMap = null;
private Cigar cigar;
private int alignmentStartHapwrtRef;
private Event artificialEvent = null;
private double score = 0;
/**
@ -93,11 +92,6 @@ public class Haplotype extends Allele {
super(allele, true);
}
protected Haplotype( final byte[] bases, final Event artificialEvent ) {
this(bases, false);
this.artificialEvent = artificialEvent;
}
public Haplotype( final byte[] bases, final GenomeLoc loc ) {
this(bases, false);
this.genomeLocation = loc;
@ -189,7 +183,7 @@ public class Haplotype extends Allele {
}
/**
* Get the cigar for this haplotype. Note that cigar is guarenteed to be consolidated
* Get the cigar for this haplotype. Note that the cigar is guaranteed to be consolidated
* in that multiple adjacent equal operates will have been merged
* @return the cigar of this haplotype
*/
@ -223,30 +217,6 @@ public class Haplotype extends Allele {
throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength());
}
public boolean isArtificialHaplotype() {
return artificialEvent != null;
}
public Event getArtificialEvent() {
return artificialEvent;
}
public Allele getArtificialRefAllele() {
return artificialEvent.ref;
}
public Allele getArtificialAltAllele() {
return artificialEvent.alt;
}
public int getArtificialAllelePosition() {
return artificialEvent.pos;
}
public void setArtificialEvent( final Event artificialEvent ) {
this.artificialEvent = artificialEvent;
}
@Requires({"refInsertLocation >= 0"})
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
// refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
@ -260,7 +230,7 @@ public class Haplotype extends Allele {
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, 0, haplotypeInsertLocation)); // bases before the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, haplotypeInsertLocation + refAllele.length(), myBases.length)); // bases after the variant
return new Haplotype(newHaplotypeBases, new Event(refAllele, altAllele, genomicInsertLocation));
return new Haplotype(newHaplotypeBases);
}
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(final List<Allele> alleleList,