Merge pull request #249 from broadinstitute/rp_hc_gga_mode
New implementation of the GGA mode in the HaplotypeCaller
This commit is contained in:
commit
a96f48bc39
|
|
@ -77,6 +77,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
private final static int NUM_PATHS_PER_GRAPH = 25;
|
||||
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
|
||||
private static final int GRAPH_KMER_STEP = 6;
|
||||
private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
||||
|
||||
private final int minKmer;
|
||||
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
|
||||
|
|
@ -92,8 +93,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
|
||||
final List<SeqGraph> graphs = new LinkedList<SeqGraph>();
|
||||
protected List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
|
||||
final List<SeqGraph> graphs = new LinkedList<>();
|
||||
|
||||
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
|
||||
if( maxKmer < minKmer) {
|
||||
|
|
@ -106,7 +107,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
continue;
|
||||
|
||||
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
|
||||
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype);
|
||||
DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes);
|
||||
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
|
||||
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
|
||||
|
|
@ -133,7 +134,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
|
||||
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype ) {
|
||||
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
|
||||
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
|
||||
final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
|
||||
|
||||
|
|
@ -142,8 +143,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// something went wrong, so abort right now with a null graph
|
||||
return null;
|
||||
|
||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it
|
||||
if ( ! addReadKmersToGraph(builder, reads) )
|
||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it as well as the artificial GGA haplotypes
|
||||
if ( ! addReadKmersToGraph(builder, reads, activeAlleleHaplotypes) )
|
||||
// some problem was detected adding the reads to the graph, return null to indicate we failed
|
||||
return null;
|
||||
|
||||
|
|
@ -156,11 +157,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
*
|
||||
* @param builder a debruijn graph builder to add the read kmers to
|
||||
* @param reads a non-null list of reads whose kmers we want to add to the graph
|
||||
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
|
||||
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||
*/
|
||||
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
|
||||
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads, final List<Haplotype> activeAlleleHaplotypes) {
|
||||
final int kmerLength = builder.getKmerSize();
|
||||
|
||||
// First pull kmers out of the artificial GGA haplotypes and throw them on the graph
|
||||
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
|
||||
final int end = haplotype.length() - kmerLength;
|
||||
for( int start = 0; start < end; start++ ) {
|
||||
builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS );
|
||||
}
|
||||
}
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ public class GenotypingEngine {
|
|||
|
||||
private final boolean DEBUG;
|
||||
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static List<Allele> noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final VariantAnnotatorEngine annotationEngine;
|
||||
private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
|
||||
|
||||
|
|
@ -162,8 +162,8 @@ public class GenotypingEngine {
|
|||
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype);
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
final Set<Haplotype> calledHaplotypes = new HashSet<>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<>();
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
||||
|
|
@ -183,7 +183,7 @@ public class GenotypingEngine {
|
|||
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {
|
||||
throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles.");
|
||||
}
|
||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<VariantContext, Allele>();
|
||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
|
||||
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
|
||||
for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) {
|
||||
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
|
||||
|
|
@ -244,7 +244,7 @@ public class GenotypingEngine {
|
|||
|
||||
if ( in_GGA_mode ) startPosKeySet.clear();
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
//cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently
|
||||
if ( !in_GGA_mode ) {
|
||||
// run the event merger if we're not in GGA mode
|
||||
final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc);
|
||||
|
|
@ -267,7 +267,7 @@ public class GenotypingEngine {
|
|||
* @return the list of the sources of vcs in the same order
|
||||
*/
|
||||
private List<String> makePriorityList(final List<VariantContext> vcs) {
|
||||
final List<String> priorityList = new LinkedList<String>();
|
||||
final List<String> priorityList = new LinkedList<>();
|
||||
for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource());
|
||||
return priorityList;
|
||||
}
|
||||
|
|
@ -276,7 +276,7 @@ public class GenotypingEngine {
|
|||
final int loc,
|
||||
final List<VariantContext> activeAllelesToGenotype) {
|
||||
// the overlapping events to merge into a common reference view
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<>();
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
|
|
@ -292,7 +292,7 @@ public class GenotypingEngine {
|
|||
if( compVC.getStart() == loc ) {
|
||||
int alleleCount = 0;
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
List<Allele> alleleSet = new ArrayList<Allele>(2);
|
||||
List<Allele> alleleSet = new ArrayList<>(2);
|
||||
alleleSet.add(compVC.getReference());
|
||||
alleleSet.add(compAltAllele);
|
||||
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
|
||||
|
|
@ -348,7 +348,7 @@ public class GenotypingEngine {
|
|||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final VariantContext call ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call);
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
|
|
@ -384,7 +384,7 @@ public class GenotypingEngine {
|
|||
// TODO - split into input haplotypes and output haplotypes as not to share I/O arguments
|
||||
@Requires("haplotypes != null")
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
|
||||
final List<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
final List<Haplotype> haplotypesToRemove = new ArrayList<>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
|
||||
if( vc.isSymbolic() ) {
|
||||
|
|
@ -407,7 +407,7 @@ public class GenotypingEngine {
|
|||
final Map<Allele, List<Haplotype>> alleleMapper,
|
||||
final double downsamplingFraction ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<>();
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
|
||||
|
|
@ -430,7 +430,7 @@ public class GenotypingEngine {
|
|||
}
|
||||
|
||||
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final Map<VariantContext, Allele> mergeMap, final Map<Event, List<Haplotype>> eventMap ) {
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<Allele, List<Haplotype>>();
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<>();
|
||||
for( final Map.Entry<VariantContext, Allele> entry : mergeMap.entrySet() ) {
|
||||
alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey())));
|
||||
}
|
||||
|
|
@ -441,100 +441,33 @@ public class GenotypingEngine {
|
|||
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
|
||||
protected static Map<Event, List<Haplotype>> createEventMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
|
||||
|
||||
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<Event, List<Haplotype>>(eventsAtThisLoc.size()+1);
|
||||
VariantContext refVC = eventsAtThisLoc.get(0); // the genome loc is the only safe thing to pull out of this VC because ref/alt pairs might change reference basis
|
||||
eventMapper.put(new Event(null), new ArrayList<Haplotype>());
|
||||
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1);
|
||||
final Event refEvent = new Event(null);
|
||||
eventMapper.put(refEvent, new ArrayList<Haplotype>());
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
eventMapper.put(new Event(vc), new ArrayList<Haplotype>());
|
||||
}
|
||||
|
||||
final List<Haplotype> undeterminedHaplotypes = new ArrayList<Haplotype>(haplotypes.size());
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() ) {
|
||||
final List<Allele> alleles = new ArrayList<Allele>(2);
|
||||
alleles.add(h.getArtificialRefAllele());
|
||||
alleles.add(h.getArtificialAltAllele());
|
||||
final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype")
|
||||
.alleles(alleles)
|
||||
.loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() );
|
||||
if( eventMapper.containsKey(artificialVC) ) {
|
||||
eventMapper.get(artificialVC).add(h);
|
||||
}
|
||||
} else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later
|
||||
undeterminedHaplotypes.add(h);
|
||||
if( h.getEventMap().get(loc) == null ) {
|
||||
eventMapper.get(refEvent).add(h);
|
||||
} else {
|
||||
boolean haplotypeIsDetermined = false;
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
eventMapper.get(new Event(vcAtThisLoc)).add(h);
|
||||
haplotypeIsDetermined = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( !haplotypeIsDetermined )
|
||||
undeterminedHaplotypes.add(h);
|
||||
}
|
||||
}
|
||||
|
||||
for( final Haplotype h : undeterminedHaplotypes ) {
|
||||
Event matchingEvent = new Event(null);
|
||||
for( final Map.Entry<Event, List<Haplotype>> eventToTest : eventMapper.entrySet() ) {
|
||||
// don't test against the reference allele
|
||||
if( eventToTest.getKey().equals(new Event(null)) )
|
||||
continue;
|
||||
|
||||
// only try to disambiguate for alleles that have had haplotypes previously assigned above
|
||||
if( eventToTest.getValue().isEmpty() )
|
||||
continue;
|
||||
|
||||
final Haplotype artificialHaplotype = eventToTest.getValue().get(0);
|
||||
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
|
||||
matchingEvent = eventToTest.getKey();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
eventMapper.get(matchingEvent).add(h);
|
||||
}
|
||||
|
||||
return eventMapper;
|
||||
}
|
||||
|
||||
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset, final boolean resolveSupersetToSubset) {
|
||||
|
||||
for ( final Map.Entry<Integer, VariantContext> fromSubset : subset.entrySet() ) {
|
||||
final VariantContext fromSuperset = superset.get(fromSubset.getKey());
|
||||
if ( fromSuperset == null )
|
||||
return false;
|
||||
|
||||
List<Allele> supersetAlleles = fromSuperset.getAlternateAlleles();
|
||||
if ( resolveSupersetToSubset )
|
||||
supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
|
||||
|
||||
if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static List<Allele> resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List<Allele> currentAlleles) {
|
||||
if ( targetReference.length() <= actualReference.length() )
|
||||
return currentAlleles;
|
||||
|
||||
final List<Allele> newAlleles = new ArrayList<Allele>(currentAlleles.size());
|
||||
final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
|
||||
for ( final Allele a : currentAlleles ) {
|
||||
newAlleles.add(Allele.extend(a, extraBases));
|
||||
}
|
||||
return newAlleles;
|
||||
}
|
||||
|
||||
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
|
||||
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final List<List<Haplotype>> alleleMapper, final List<Haplotype> haplotypes ) {
|
||||
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
|
||||
final List<Allele> eventAllelesForSample = new ArrayList<Allele>();
|
||||
final List<Allele> eventAllelesForSample = new ArrayList<>();
|
||||
for( final Allele a : haplotypeAllelesForSample ) {
|
||||
final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a));
|
||||
for( int iii = 0; iii < alleleMapper.size(); iii++ ) {
|
||||
|
|
|
|||
|
|
@ -47,6 +47,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -433,8 +436,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
private final static int MIN_READ_LENGTH = 10;
|
||||
|
||||
private List<String> samplesList = new ArrayList<String>();
|
||||
private final static double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
|
||||
private final List<VariantContext> allelesToGenotype = new ArrayList<VariantContext>();
|
||||
|
||||
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
|
||||
|
|
@ -603,7 +604,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// if we don't have any data, just abort early
|
||||
return new ActivityProfileState(ref.getLocus(), 0.0);
|
||||
|
||||
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
final List<Allele> noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
|
||||
|
|
@ -625,14 +626,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
}
|
||||
genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
|
||||
genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
|
||||
genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
|
||||
genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
|
||||
genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
|
||||
}
|
||||
}
|
||||
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final List<Allele> alleles = new ArrayList<>();
|
||||
alleles.add( FAKE_REF_ALLELE );
|
||||
alleles.add( FAKE_ALT_ALLELE );
|
||||
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
|
|
@ -746,9 +747,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
|
||||
|
||||
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
|
||||
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
|
||||
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
|
||||
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
|
||||
|
||||
|
|
@ -760,6 +761,21 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to create the reference haplotype out of the active region and a padded loc
|
||||
* @param activeRegion the active region from which to generate the reference haplotype
|
||||
* @param paddedReferenceLoc the GenomeLoc which includes padding and shows how big the reference haplotype should be
|
||||
* @return a non-null haplotype
|
||||
*/
|
||||
private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) {
|
||||
final Haplotype refHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
|
||||
refHaplotype.setAlignmentStartHapwrtRef(activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart());
|
||||
final Cigar c = new Cigar();
|
||||
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
|
||||
refHaplotype.setCigar(c);
|
||||
return refHaplotype;
|
||||
}
|
||||
|
||||
/**
|
||||
* Trim down the active region to just enough to properly genotype the events among the haplotypes
|
||||
*
|
||||
|
|
@ -791,7 +807,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
// trim down the haplotypes
|
||||
final Set<Haplotype> haplotypeSet = new HashSet<Haplotype>(haplotypes.size());
|
||||
final Set<Haplotype> haplotypeSet = new HashSet<>(haplotypes.size());
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
|
||||
if ( trimmed != null ) {
|
||||
|
|
@ -802,7 +818,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
// create the final list of trimmed haplotypes
|
||||
final List<Haplotype> trimmedHaplotypes = new ArrayList<Haplotype>(haplotypeSet);
|
||||
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(haplotypeSet);
|
||||
|
||||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
|
||||
|
|
@ -816,7 +832,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
|
||||
|
||||
// trim down the reads and add them to the trimmed active region
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<GATKSAMRecord>(originalActiveRegion.getReads().size());
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size());
|
||||
for( final GATKSAMRecord read : originalActiveRegion.getReads() ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
|
||||
if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
|
|
@ -937,7 +953,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
private Map<String, List<GATKSAMRecord>> splitReadsBySample( final Collection<GATKSAMRecord> reads ) {
|
||||
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<String, List<GATKSAMRecord>>();
|
||||
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<>();
|
||||
for( final String sample : samplesList) {
|
||||
List<GATKSAMRecord> readList = returnMap.get( sample );
|
||||
if( readList == null ) {
|
||||
|
|
|
|||
|
|
@ -74,7 +74,6 @@ import java.util.*;
|
|||
public class LikelihoodCalculationEngine {
|
||||
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private final byte constantGCP;
|
||||
private final double log10globalReadMismappingRate;
|
||||
private final boolean DEBUG;
|
||||
|
|
@ -299,7 +298,7 @@ public class LikelihoodCalculationEngine {
|
|||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF );
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
|
||||
|
|
@ -397,11 +396,11 @@ public class LikelihoodCalculationEngine {
|
|||
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
|
||||
|
||||
// all of the haplotypes that at least one sample called as one of the most likely
|
||||
final Set<Haplotype> selectedHaplotypes = new HashSet<Haplotype>();
|
||||
final Set<Haplotype> selectedHaplotypes = new HashSet<>();
|
||||
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
|
||||
|
||||
// our annoying map from allele -> haplotype
|
||||
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<Allele, Haplotype>();
|
||||
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
|
||||
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
|
||||
|
|
|
|||
|
|
@ -111,7 +111,11 @@ public abstract class LocalAssemblyEngine {
|
|||
* @param refHaplotype the reference haplotype
|
||||
* @return a non-null list of reads
|
||||
*/
|
||||
protected abstract List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype);
|
||||
protected abstract List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype, List<Haplotype> activeAlleleHaplotypes);
|
||||
|
||||
protected List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype) {
|
||||
return assemble(reads, refHaplotype, Collections.<Haplotype>emptyList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
|
||||
|
|
@ -128,8 +132,11 @@ public abstract class LocalAssemblyEngine {
|
|||
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
|
||||
if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// create the list of artificial haplotypes that should be added to the graph for GGA mode
|
||||
final List<Haplotype> activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc());
|
||||
|
||||
// create the graphs by calling our subclass assemble method
|
||||
final List<SeqGraph> graphs = assemble(activeRegion.getReads(), refHaplotype);
|
||||
final List<SeqGraph> graphs = assemble(activeRegion.getReads(), refHaplotype, activeAlleleHaplotypes);
|
||||
|
||||
// do some QC on the graphs
|
||||
for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); }
|
||||
|
|
@ -138,45 +145,53 @@ public abstract class LocalAssemblyEngine {
|
|||
if ( graphWriter != null ) { printGraphs(graphs); }
|
||||
|
||||
// find the best paths in the graphs and return them as haplotypes
|
||||
return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
|
||||
return findBestPaths( graphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
|
||||
}
|
||||
|
||||
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<Haplotype>();
|
||||
refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
|
||||
final Cigar c = new Cigar();
|
||||
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
|
||||
refHaplotype.setCigar(c);
|
||||
returnHaplotypes.add( refHaplotype );
|
||||
|
||||
/**
|
||||
* Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype
|
||||
* @param refHaplotype the reference haplotype
|
||||
* @param activeAllelesToGenotype the list of alternate alleles in VariantContexts
|
||||
* @param activeRegionWindow the window containing the reference haplotype
|
||||
* @return a non-null list of haplotypes
|
||||
*/
|
||||
private List<Haplotype> createActiveAlleleHaplotypes(final Haplotype refHaplotype, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
|
||||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
|
||||
addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
|
||||
if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype
|
||||
returnHaplotypes.add(insertedRefHaplotype);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new ArrayList<>(returnHaplotypes);
|
||||
}
|
||||
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow) {
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
|
||||
returnHaplotypes.add( refHaplotype );
|
||||
|
||||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
|
||||
for( final SeqGraph graph : graphs ) {
|
||||
final SeqVertex source = graph.getReferenceSourceVertex();
|
||||
final SeqVertex sink = graph.getReferenceSinkVertex();
|
||||
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
|
||||
|
||||
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>(allowCyclesInKmerGraphToGeneratePaths);
|
||||
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(allowCyclesInKmerGraphToGeneratePaths);
|
||||
for ( final Path<SeqVertex,BaseEdge> path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) {
|
||||
// logger.info("Found path " + path);
|
||||
Haplotype h = new Haplotype( path.getBases() );
|
||||
if( !returnHaplotypes.contains(h) ) {
|
||||
final Cigar cigar = path.calculateCigar(refHaplotype.getBases());
|
||||
|
||||
if ( cigar == null ) {
|
||||
// couldn't produce a meaningful alignment of haplotype to reference, fail quitely
|
||||
// couldn't produce a meaningful alignment of haplotype to reference, fail quietly
|
||||
continue;
|
||||
} else if( cigar.isEmpty() ) {
|
||||
throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() +
|
||||
|
|
@ -197,25 +212,6 @@ public abstract class LocalAssemblyEngine {
|
|||
|
||||
if ( debug )
|
||||
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
if( !activeAllelesToGenotype.isEmpty() ) {
|
||||
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
|
||||
|
||||
// This if statement used to additionally have:
|
||||
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
|
||||
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
|
||||
// a haplotype that already contains a 1bp insertion (so practically it is reference but
|
||||
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
|
||||
if( vcOnHaplotype == null ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -238,7 +234,7 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
return new ArrayList<Haplotype>(returnHaplotypes);
|
||||
return new ArrayList<>(returnHaplotypes);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -256,71 +252,6 @@ public abstract class LocalAssemblyEngine {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
|
||||
* Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
|
||||
* This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
|
||||
* @param haplotype the candidate haplotype
|
||||
* @param ref the reference bases to align against
|
||||
* @param haplotypeList the current list of haplotypes
|
||||
* @param activeRegionStart the start of the active region in the reference byte array
|
||||
* @param activeRegionStop the stop of the active region in the reference byte array
|
||||
* @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
|
||||
* @return true if the candidate haplotype was successfully incorporated into the haplotype list
|
||||
*/
|
||||
@Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
|
||||
private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final Set<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
|
||||
if( haplotype == null ) { return false; }
|
||||
|
||||
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
|
||||
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
|
||||
|
||||
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
|
||||
return false;
|
||||
}
|
||||
|
||||
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
|
||||
|
||||
final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
|
||||
int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
|
||||
if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
|
||||
hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
|
||||
}
|
||||
byte[] newHaplotypeBases;
|
||||
// extend partial haplotypes to contain the full active region sequence
|
||||
if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll(ArrayUtils.addAll(ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
|
||||
haplotype.getBases()),
|
||||
ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop));
|
||||
} else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
|
||||
} else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
|
||||
} else {
|
||||
newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
|
||||
}
|
||||
|
||||
final Haplotype h = new Haplotype( newHaplotypeBases );
|
||||
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
|
||||
|
||||
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
|
||||
if ( haplotype.isArtificialHaplotype() ) {
|
||||
h.setArtificialEvent(haplotype.getArtificialEvent());
|
||||
}
|
||||
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
|
||||
return false;
|
||||
}
|
||||
|
||||
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
|
||||
|
||||
if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
|
||||
haplotypeList.add(h);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
|
||||
|
||||
|
|
@ -372,7 +303,6 @@ public abstract class LocalAssemblyEngine {
|
|||
* Perform general QC on the graph to make sure something hasn't gone wrong during assembly
|
||||
* @param graph the graph to check
|
||||
* @param refHaplotype the reference haplotype
|
||||
* @param <T>
|
||||
*/
|
||||
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
|
||||
sanityCheckReferenceGraph(graph, refHaplotype);
|
||||
|
|
@ -383,7 +313,6 @@ public abstract class LocalAssemblyEngine {
|
|||
*
|
||||
* @param graph the graph to check
|
||||
* @param refHaplotype the reference haplotype
|
||||
* @param <T>
|
||||
*/
|
||||
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckReferenceGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
|
||||
if( graph.getReferenceSourceVertex() == null ) {
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
private final static Logger logger = Logger.getLogger(ReadThreadingAssembler.class);
|
||||
|
||||
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
|
||||
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
||||
|
||||
/** The min and max kmer sizes to try when building the graph. */
|
||||
private final List<Integer> kmerSizes;
|
||||
|
|
@ -88,7 +89,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
@Override
|
||||
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
|
||||
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
|
||||
final List<SeqGraph> graphs = new LinkedList<>();
|
||||
|
||||
for ( final int kmerSize : kmerSizes ) {
|
||||
|
|
@ -96,6 +97,12 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
|
||||
// add the reference sequence to the graph
|
||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||
int hapCount = 0;
|
||||
for( final Haplotype h : activeAlleleHaplotypes ) {
|
||||
final int[] counts = new int[h.length()];
|
||||
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
|
||||
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
|
||||
}
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
|
|
|
|||
|
|
@ -590,11 +590,9 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the start and stop positions (exclusive) of the longest stretch of high quality bases
|
||||
* in read
|
||||
* Get the longest stretch of high quality bases in read and pass that sequence to the graph
|
||||
*
|
||||
* @param read a non-null read
|
||||
* @return the start and stop for high quality bases in read, or null if none exist
|
||||
*/
|
||||
protected void addRead(final GATKSAMRecord read) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
|
|
|
|||
|
|
@ -78,8 +78,6 @@ public class PairHMMIndelErrorModel {
|
|||
private static final double baseMatchArray[];
|
||||
private static final double baseMismatchArray[];
|
||||
|
||||
private final static double LOG_ONE_HALF;
|
||||
|
||||
private static final int START_HRUN_GAP_IDX = 4;
|
||||
private static final int MAX_HRUN_GAP_IDX = 20;
|
||||
|
||||
|
|
@ -97,8 +95,6 @@ public class PairHMMIndelErrorModel {
|
|||
/////////////////////////////
|
||||
|
||||
static {
|
||||
LOG_ONE_HALF= -Math.log10(2.0);
|
||||
|
||||
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
|
||||
|
|
@ -466,7 +462,7 @@ public class PairHMMIndelErrorModel {
|
|||
final double li = readLikelihoods[readIdx][i];
|
||||
final double lj = readLikelihoods[readIdx][j];
|
||||
final int readCount = readCounts[readIdx];
|
||||
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF);
|
||||
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -73,8 +73,8 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
|
|||
public void testReferenceCycleGraph() {
|
||||
String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
|
||||
String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
|
||||
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true));
|
||||
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true));
|
||||
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), Collections.<Haplotype>emptyList());
|
||||
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), Collections.<Haplotype>emptyList());
|
||||
|
||||
Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
|
||||
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
|
||||
|
|
@ -147,7 +147,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
assembler.addReadKmersToGraph(builder, Arrays.asList(read));
|
||||
assembler.addReadKmersToGraph(builder, Arrays.asList(read), Collections.<Haplotype>emptyList());
|
||||
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
|
||||
for ( final Kmer addedKmer : builder.addedPairs ) {
|
||||
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
||||
|
|
|
|||
|
|
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAComplex() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
|
||||
"008029ee34e1becd8312e3c4d608033c");
|
||||
"38b4596c3910fdde51ea59aa1a8f848f");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"ae8d95ffe77515cc74a55c2afd142826");
|
||||
"08147870d73d9749ced8cfc7cdd4714f");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"bb30d0761dc9e2dfd57bfe07b72d06d8");
|
||||
"ffd69c410dca0d2f9fe75f3cb5d08179");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -47,6 +47,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
|
||||
|
|
@ -216,6 +219,10 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
|
|||
|
||||
private List<Haplotype> assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
|
||||
final Haplotype refHaplotype = new Haplotype(refBases, true);
|
||||
final Cigar c = new Cigar();
|
||||
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
|
||||
refHaplotype.setCigar(c);
|
||||
|
||||
final ActiveRegion activeRegion = new ActiveRegion(loc, null, true, genomeLocParser, 0);
|
||||
activeRegion.addAll(reads);
|
||||
final LocalAssemblyEngine engine = createAssembler(assembler);
|
||||
|
|
|
|||
|
|
@ -85,7 +85,7 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest {
|
|||
public SeqGraph assemble() {
|
||||
assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
|
||||
assembler.setDebugGraphTransformations(true);
|
||||
final SeqGraph graph = assembler.assemble(reads, refHaplotype).get(0);
|
||||
final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0);
|
||||
if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
|
||||
return graph;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,17 +55,19 @@ public class MathUtils {
|
|||
private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP;
|
||||
private static final double MAX_JACOBIAN_TOLERANCE = 8.0;
|
||||
private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1;
|
||||
private static final int MAXN = 70000;
|
||||
private static final int MAXN = 70_000;
|
||||
private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
|
||||
|
||||
/**
|
||||
* The smallest log10 value we'll emit from normalizeFromLog10 and other functions
|
||||
* where the real-space value is 0.0.
|
||||
*/
|
||||
public final static double LOG10_P_OF_ZERO = -1000000.0;
|
||||
public final static double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5);
|
||||
private final static double NATURAL_LOG_OF_TEN = Math.log(10.0);
|
||||
private final static double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI);
|
||||
public static final double LOG10_P_OF_ZERO = -1000000.0;
|
||||
public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5);
|
||||
public static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
public static final double LOG_ONE_THIRD = -Math.log10(3.0);
|
||||
private static final double NATURAL_LOG_OF_TEN = Math.log(10.0);
|
||||
private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI);
|
||||
|
||||
static {
|
||||
log10Cache = new double[LOG10_CACHE_SIZE];
|
||||
|
|
|
|||
|
|
@ -221,7 +221,7 @@ public class PerReadAlleleLikelihoodMap {
|
|||
final int count = ReadUtils.getMeanRepresentativeReadCount(read);
|
||||
final double likelihood_iii = entry.getValue().get(iii_allele);
|
||||
final double likelihood_jjj = entry.getValue().get(jjj_allele);
|
||||
haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF);
|
||||
haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF);
|
||||
|
||||
// fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair
|
||||
if ( haplotypeLikelihood < maxElement ) break;
|
||||
|
|
@ -241,7 +241,6 @@ public class PerReadAlleleLikelihoodMap {
|
|||
|
||||
return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement);
|
||||
}
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
|
||||
/**
|
||||
* Given a map from alleles to likelihoods, find the allele with the largest likelihood.
|
||||
|
|
|
|||
|
|
@ -46,7 +46,6 @@ public class Haplotype extends Allele {
|
|||
private EventMap eventMap = null;
|
||||
private Cigar cigar;
|
||||
private int alignmentStartHapwrtRef;
|
||||
private Event artificialEvent = null;
|
||||
private double score = 0;
|
||||
|
||||
/**
|
||||
|
|
@ -93,11 +92,6 @@ public class Haplotype extends Allele {
|
|||
super(allele, true);
|
||||
}
|
||||
|
||||
protected Haplotype( final byte[] bases, final Event artificialEvent ) {
|
||||
this(bases, false);
|
||||
this.artificialEvent = artificialEvent;
|
||||
}
|
||||
|
||||
public Haplotype( final byte[] bases, final GenomeLoc loc ) {
|
||||
this(bases, false);
|
||||
this.genomeLocation = loc;
|
||||
|
|
@ -189,7 +183,7 @@ public class Haplotype extends Allele {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the cigar for this haplotype. Note that cigar is guarenteed to be consolidated
|
||||
* Get the cigar for this haplotype. Note that the cigar is guaranteed to be consolidated
|
||||
* in that multiple adjacent equal operates will have been merged
|
||||
* @return the cigar of this haplotype
|
||||
*/
|
||||
|
|
@ -223,30 +217,6 @@ public class Haplotype extends Allele {
|
|||
throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength());
|
||||
}
|
||||
|
||||
public boolean isArtificialHaplotype() {
|
||||
return artificialEvent != null;
|
||||
}
|
||||
|
||||
public Event getArtificialEvent() {
|
||||
return artificialEvent;
|
||||
}
|
||||
|
||||
public Allele getArtificialRefAllele() {
|
||||
return artificialEvent.ref;
|
||||
}
|
||||
|
||||
public Allele getArtificialAltAllele() {
|
||||
return artificialEvent.alt;
|
||||
}
|
||||
|
||||
public int getArtificialAllelePosition() {
|
||||
return artificialEvent.pos;
|
||||
}
|
||||
|
||||
public void setArtificialEvent( final Event artificialEvent ) {
|
||||
this.artificialEvent = artificialEvent;
|
||||
}
|
||||
|
||||
@Requires({"refInsertLocation >= 0"})
|
||||
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
|
||||
// refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
|
||||
|
|
@ -260,7 +230,7 @@ public class Haplotype extends Allele {
|
|||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, 0, haplotypeInsertLocation)); // bases before the variant
|
||||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
|
||||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, haplotypeInsertLocation + refAllele.length(), myBases.length)); // bases after the variant
|
||||
return new Haplotype(newHaplotypeBases, new Event(refAllele, altAllele, genomicInsertLocation));
|
||||
return new Haplotype(newHaplotypeBases);
|
||||
}
|
||||
|
||||
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(final List<Allele> alleleList,
|
||||
|
|
|
|||
Loading…
Reference in New Issue