Merge pull request #129 from broadinstitute/md_hc_good_head_tail
Many more improvements and bugfixes to the HaplotypeCaller
This commit is contained in:
commit
e58d0a71e4
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -90,13 +91,13 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot
|
|||
}
|
||||
|
||||
for (Map<Allele,Double> el : alleleLikelihoodMap.getLikelihoodMapValues()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el);
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el);
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
refQuals.add(-10.0*(double)el.get(a));
|
||||
else if (allAlleles.contains(a))
|
||||
altQuals.add(-10.0*(double)el.get(a));
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele()));
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele()));
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -84,12 +85,12 @@ public class ClippingRankSumTest extends RankSumTest {
|
|||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey()));
|
||||
else if (allAlleles.contains(a))
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey()));
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
|
||||
|
|
@ -141,12 +142,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative() )
|
||||
continue; // read is non-informative
|
||||
if (!vc.getAlleles().contains(a))
|
||||
if (!vc.getAlleles().contains(a.getMostLikelyAllele()))
|
||||
continue; // sanity check - shouldn't be needed
|
||||
alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
|
||||
alleleCounts.put(a.getMostLikelyAllele(), alleleCounts.get(a.getMostLikelyAllele()) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
|
||||
}
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
counts[0] = alleleCounts.get(vc.getReference());
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
|
|
@ -273,10 +274,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount);
|
||||
updateTable(table, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -92,13 +93,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
|
|||
return;
|
||||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
// BUGBUG: There needs to be a comparable isUsableBase check here
|
||||
if (a.isNoCall())
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)el.getKey().getMappingQuality());
|
||||
else if (allAlleles.contains(a))
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)el.getKey().getMappingQuality());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
|
|
@ -107,8 +108,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
}
|
||||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative() )
|
||||
continue; // read is non-informative
|
||||
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
|
|
@ -123,9 +124,9 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
if (readPos > numAlignedBases / 2)
|
||||
readPos = numAlignedBases - (readPos + 1);
|
||||
|
||||
if (a.isReference())
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)readPos);
|
||||
else if (allAlleles.contains(a))
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)readPos);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -65,7 +66,6 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -83,7 +83,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where
|
||||
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
|
||||
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25;
|
||||
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16;
|
||||
private static final int GRAPH_KMER_STEP = 6;
|
||||
|
||||
// Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode
|
||||
|
|
@ -94,30 +93,23 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
private final boolean debug;
|
||||
private final boolean debugGraphTransformations;
|
||||
private final PrintStream graphWriter;
|
||||
private final int minKmer;
|
||||
private final byte minBaseQualityToUseInAssembly;
|
||||
|
||||
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
|
||||
|
||||
private int PRUNE_FACTOR = 2;
|
||||
|
||||
protected DeBruijnAssembler() {
|
||||
this(false, -1, null, 11, DEFAULT_MIN_BASE_QUALITY_TO_USE);
|
||||
this(false, -1, 11);
|
||||
}
|
||||
|
||||
public DeBruijnAssembler(final boolean debug,
|
||||
final int debugGraphTransformations,
|
||||
final PrintStream graphWriter,
|
||||
final int minKmer,
|
||||
final byte minBaseQualityToUseInAssembly) {
|
||||
final int minKmer) {
|
||||
super();
|
||||
this.debug = debug;
|
||||
this.debugGraphTransformations = debugGraphTransformations > 0;
|
||||
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations;
|
||||
this.graphWriter = graphWriter;
|
||||
this.minKmer = minKmer;
|
||||
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -126,19 +118,15 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
* @param refHaplotype reference haplotype object
|
||||
* @param fullReferenceWithPadding byte array holding the reference sequence with padding
|
||||
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
|
||||
* @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value
|
||||
* @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
|
||||
* @return a non-empty list of all the haplotypes that are produced during assembly
|
||||
*/
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List<VariantContext> activeAllelesToGenotype ) {
|
||||
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype ) {
|
||||
if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
|
||||
if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
|
||||
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
|
||||
if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// set the pruning factor for this run of the assembly engine
|
||||
this.PRUNE_FACTOR = PRUNE_FACTOR;
|
||||
if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// create the graphs
|
||||
final List<SeqGraph> graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
|
||||
|
|
@ -167,17 +155,19 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
continue;
|
||||
|
||||
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
|
||||
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug);
|
||||
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype);
|
||||
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
|
||||
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR);
|
||||
graph = graph.errorCorrect();
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR);
|
||||
graph.cleanNonRefPaths();
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
|
||||
|
||||
if ( shouldErrorCorrectKmers() ) {
|
||||
graph = errorCorrect(graph);
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor);
|
||||
}
|
||||
|
||||
final SeqGraph seqGraph = toSeqGraph(graph);
|
||||
|
||||
if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference
|
||||
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
|
||||
sanityCheckReferenceGraph(seqGraph, refHaplotype);
|
||||
graphs.add(seqGraph);
|
||||
|
||||
|
|
@ -193,12 +183,31 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) {
|
||||
final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR);
|
||||
seqGraph.pruneGraph(PRUNE_FACTOR);
|
||||
seqGraph.removeVerticesNotConnectedToRef();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
|
||||
|
||||
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
|
||||
seqGraph.zipLinearChains();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
|
||||
|
||||
// now go through and prune the graph, removing vertices no longer connected to the reference chain
|
||||
// IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
|
||||
// edges to maintain graph connectivity.
|
||||
seqGraph.pruneGraph(pruneFactor);
|
||||
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
|
||||
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
|
||||
seqGraph.simplifyGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
|
||||
|
||||
// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
|
||||
// happen in cases where for example the reference somehow manages to acquire a cycle, or
|
||||
// where the entire assembly collapses back into the reference sequence.
|
||||
if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
|
||||
return null;
|
||||
|
||||
seqGraph.removePathsNotConnectedToRef();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
|
||||
|
||||
return seqGraph;
|
||||
}
|
||||
|
||||
|
|
@ -217,64 +226,135 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
@Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"})
|
||||
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
|
||||
|
||||
final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH);
|
||||
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
|
||||
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype ) {
|
||||
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
|
||||
|
||||
// First pull kmers from the reference haplotype and add them to the graph
|
||||
//logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString());
|
||||
final byte[] refSequence = refHaplotype.getBases();
|
||||
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true, 1) ) {
|
||||
if( DEBUG ) {
|
||||
System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping");
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( ! addReferenceKmersToGraph(graph, refHaplotype.getBases()) )
|
||||
// something went wrong, so abort right now with a null graph
|
||||
return null;
|
||||
|
||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it
|
||||
if ( ! addReadKmersToGraph(graph, reads) )
|
||||
// some problem was detected adding the reads to the graph, return null to indicate we failed
|
||||
return null;
|
||||
|
||||
graph.cleanNonRefPaths();
|
||||
return graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the high-quality kmers from the reads to the graph
|
||||
*
|
||||
* @param graph a graph to add the read kmers to
|
||||
* @param reads a non-null list of reads whose kmers we want to add to the graph
|
||||
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||
*/
|
||||
protected boolean addReadKmersToGraph(final DeBruijnGraph graph, final List<GATKSAMRecord> reads) {
|
||||
final int kmerLength = graph.getKmerSize();
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
//if ( ! read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue;
|
||||
//logger.info("Adding read " + read + " with sequence " + read.getReadString());
|
||||
final byte[] sequence = read.getReadBases();
|
||||
final byte[] qualities = read.getBaseQualities();
|
||||
final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
|
||||
if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
if( sequence.length > kmerLength + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = sequence.length - kmerLength + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
// if the qualities of all the bases in the kmers are high enough
|
||||
boolean badKmer = false;
|
||||
for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) {
|
||||
for( int jjj = iii; jjj < iii + kmerLength + 1; jjj++) {
|
||||
if( qualities[jjj] < minBaseQualityToUseInAssembly ) {
|
||||
badKmer = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !badKmer ) {
|
||||
// how many observations of this kmer have we seen? A normal read counts for 1, but
|
||||
// a reduced read might imply a higher multiplicity for our the edge
|
||||
int countNumber = 1;
|
||||
if( read.isReducedRead() ) {
|
||||
// compute mean number of reduced read counts in current kmer span
|
||||
// precise rounding can make a difference with low consensus counts
|
||||
countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + KMER_LENGTH));
|
||||
// TODO -- optimization: should extend arrayMax function to take start stop values
|
||||
countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + kmerLength));
|
||||
}
|
||||
|
||||
final byte[] kmer1 = Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH);
|
||||
final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH);
|
||||
|
||||
for( int kkk=0; kkk < countNumber; kkk++ ) {
|
||||
graph.addKmersToGraph(kmer1, kmer2, false, 1);
|
||||
}
|
||||
graph.addKmerPairFromSeqToGraph(sequence, iii, false, countNumber);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return graph;
|
||||
// always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the kmers from the reference sequence to the DeBruijnGraph
|
||||
*
|
||||
* @param graph the graph to add the reference kmers to. Must be empty
|
||||
* @param refSequence the reference sequence from which we'll get our kmers
|
||||
* @return true if we succeeded in creating a good graph from the reference sequence, false otherwise
|
||||
*/
|
||||
protected boolean addReferenceKmersToGraph(final DeBruijnGraph graph, final byte[] refSequence) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( graph.vertexSet().size() != 0 ) throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + graph.vertexSet().size() + " vertices in it already: " + graph);
|
||||
if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null");
|
||||
|
||||
|
||||
final int kmerLength = graph.getKmerSize();
|
||||
if( refSequence.length < kmerLength + KMER_OVERLAP ) {
|
||||
// not enough reference sequence to build a kmer graph of this length, return null
|
||||
return false;
|
||||
}
|
||||
|
||||
final int kmersInSequence = refSequence.length - kmerLength + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
graph.addKmerPairFromSeqToGraph(refSequence, iii, true, 1);
|
||||
}
|
||||
|
||||
// we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices
|
||||
if ( graph.vertexSet().size() != kmersInSequence ) {
|
||||
if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Error correct the kmers in this graph, returning a new graph built from those error corrected kmers
|
||||
* @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason
|
||||
* we cannot actually do the error correction
|
||||
*/
|
||||
public DeBruijnGraph errorCorrect(final DeBruijnGraph graph) {
|
||||
final KMerErrorCorrector corrector = new KMerErrorCorrector(graph.getKmerSize(), 1, 1, 5); // TODO -- should be static variables
|
||||
|
||||
for( final BaseEdge e : graph.edgeSet() ) {
|
||||
for ( final byte[] kmer : Arrays.asList(graph.getEdgeSource(e).getSequence(), graph.getEdgeTarget(e).getSequence())) {
|
||||
// TODO -- need a cleaner way to deal with the ref weight
|
||||
corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity());
|
||||
}
|
||||
}
|
||||
|
||||
if ( corrector.computeErrorCorrectionMap() ) {
|
||||
final DeBruijnGraph correctedGraph = new DeBruijnGraph(graph.getKmerSize());
|
||||
|
||||
for( final BaseEdge e : graph.edgeSet() ) {
|
||||
final byte[] source = corrector.getErrorCorrectedKmer(graph.getEdgeSource(e).getSequence());
|
||||
final byte[] target = corrector.getErrorCorrectedKmer(graph.getEdgeTarget(e).getSequence());
|
||||
if ( source != null && target != null ) {
|
||||
correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity());
|
||||
}
|
||||
}
|
||||
|
||||
return correctedGraph;
|
||||
} else {
|
||||
// the error correction wasn't possible, simply return this graph
|
||||
return graph;
|
||||
}
|
||||
}
|
||||
|
||||
protected void printGraphs(final List<SeqGraph> graphs) {
|
||||
|
|
@ -287,7 +367,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
continue;
|
||||
}
|
||||
|
||||
graph.printGraph(graphWriter, false, PRUNE_FACTOR);
|
||||
graph.printGraph(graphWriter, false, pruneFactor);
|
||||
|
||||
if ( debugGraphTransformations )
|
||||
break;
|
||||
|
|
@ -322,6 +402,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
for( final SeqGraph graph : graphs ) {
|
||||
for ( final Path<SeqVertex> path : new KBestPaths<SeqVertex>().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
|
||||
// logger.info("Found path " + path);
|
||||
Haplotype h = new Haplotype( path.getBases() );
|
||||
if( !returnHaplotypes.contains(h) ) {
|
||||
final Cigar cigar = path.calculateCigar();
|
||||
|
|
@ -377,13 +458,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
if( debug ) {
|
||||
if( returnHaplotypes.size() > 1 ) {
|
||||
System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
|
||||
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
|
||||
} else {
|
||||
System.out.println("Found only the reference haplotype in the assembly graph.");
|
||||
logger.info("Found only the reference haplotype in the assembly graph.");
|
||||
}
|
||||
for( final Haplotype h : returnHaplotypes ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
|
||||
logger.info( h.toString() );
|
||||
logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ import com.google.java.contract.Requires;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
|
|
@ -66,6 +67,7 @@ import java.io.PrintStream;
|
|||
import java.util.*;
|
||||
|
||||
public class GenotypingEngine {
|
||||
private final static Logger logger = Logger.getLogger(GenotypingEngine.class);
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
|
||||
|
|
@ -168,15 +170,15 @@ public class GenotypingEngine {
|
|||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
int count = 0;
|
||||
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
|
||||
if( DEBUG ) { logger.info("=== Best Haplotypes ==="); }
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
|
||||
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
System.out.println( ">> Events = " + h.getEventMap());
|
||||
logger.info(h.toString());
|
||||
logger.info("> Cigar = " + h.getCigar());
|
||||
logger.info(">> Events = " + h.getEventMap());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -261,7 +263,7 @@ public class GenotypingEngine {
|
|||
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
|
|
@ -500,9 +502,9 @@ public class GenotypingEngine {
|
|||
if( isBiallelic ) {
|
||||
final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) );
|
||||
if( DEBUG ) {
|
||||
System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2));
|
||||
System.out.println("-- " + thisVC);
|
||||
System.out.println("-- " + nextVC);
|
||||
logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2));
|
||||
logger.info("-- " + thisVC);
|
||||
logger.info("-- " + nextVC);
|
||||
}
|
||||
if( R2 > MERGE_EVENTS_R2_THRESHOLD ) {
|
||||
|
||||
|
|
@ -528,7 +530,7 @@ public class GenotypingEngine {
|
|||
if(!containsStart) { startPosKeySet.remove(thisStart); }
|
||||
if(!containsNext) { startPosKeySet.remove(nextStart); }
|
||||
|
||||
if( DEBUG ) { System.out.println("====> " + mergedVC); }
|
||||
if( DEBUG ) { logger.info("====> " + mergedVC); }
|
||||
mapWasUpdated = true;
|
||||
break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events
|
||||
}
|
||||
|
|
|
|||
|
|
@ -171,6 +171,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png
|
||||
*
|
||||
*/
|
||||
@Advanced
|
||||
@Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false)
|
||||
protected StingSAMFileWriter bamWriter = null;
|
||||
private HaplotypeBAMWriter haplotypeBAMWriter;
|
||||
|
|
@ -178,12 +179,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
/**
|
||||
* The type of BAM output we want to see.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false)
|
||||
public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES;
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
|
||||
|
||||
|
|
@ -191,6 +194,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||
protected String keepRG = null;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
|
||||
protected int MIN_PRUNE_FACTOR = 1;
|
||||
|
||||
|
|
@ -200,7 +204,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
@Advanced
|
||||
@Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
|
||||
protected int maxNumHaplotypesInPopulation = 13;
|
||||
protected int maxNumHaplotypesInPopulation = 25;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
|
||||
|
|
@ -217,6 +221,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false)
|
||||
protected boolean includeUnmappedReads = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
|
||||
protected boolean USE_ALLELES_TRIGGER = false;
|
||||
|
||||
|
|
@ -232,6 +237,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean dontGenotype = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||
protected boolean errorCorrectKmers = false;
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP is not used in any way for the calculations themselves.
|
||||
|
|
@ -246,6 +255,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* Records that are filtered in the comp track will be ignored.
|
||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||
*/
|
||||
@Advanced
|
||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||
|
|
@ -258,6 +268,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
/**
|
||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>(Arrays.asList(new String[]{"ClippingRankSumTest"}));
|
||||
|
||||
|
|
@ -265,6 +276,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
|
||||
|
||||
|
|
@ -277,12 +289,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@ArgumentCollection
|
||||
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
|
||||
protected boolean DEBUG;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
|
||||
protected int debugGraphTransformations = -1;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
|
||||
protected boolean useLowQualityBasesForAssembly = false;
|
||||
|
||||
|
|
@ -388,8 +403,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
|
||||
}
|
||||
|
||||
final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? (byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE;
|
||||
assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, minBaseQualityToUseInAssembly );
|
||||
// setup the assembler
|
||||
assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer);
|
||||
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
||||
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
|
||||
if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter);
|
||||
if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1);
|
||||
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
|
||||
|
||||
|
|
@ -520,7 +540,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
|
||||
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype );
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
|
||||
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
|
||||
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
|
||||
|
|
@ -537,8 +557,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final List<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ?
|
||||
likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes );
|
||||
final List<Haplotype> bestHaplotypes = selectBestHaplotypesForGenotyping(haplotypes, stratifiedReadMap);
|
||||
|
||||
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
|
||||
bestHaplotypes,
|
||||
|
|
@ -561,11 +580,27 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap);
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
|
||||
if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
||||
|
||||
return 1; // One active region was processed during this map call
|
||||
}
|
||||
|
||||
/**
|
||||
* Select the best N haplotypes according to their likelihoods, if appropriate
|
||||
*
|
||||
* @param haplotypes a list of haplotypes to consider
|
||||
* @param stratifiedReadMap a map from samples -> read likelihoods
|
||||
* @return the list of haplotypes to genotype
|
||||
*/
|
||||
protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
// TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping)
|
||||
if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
return haplotypes;
|
||||
} else {
|
||||
return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
|
|
@ -594,7 +629,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
|
||||
activeRegion.clearReads();
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -62,6 +63,7 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
import java.util.*;
|
||||
|
||||
public class LikelihoodCalculationEngine {
|
||||
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private final byte constantGCP;
|
||||
|
|
@ -229,7 +231,7 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public List<Haplotype> selectBestHaplotypes( final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation ) {
|
||||
public List<Haplotype> selectBestHaplotypesFromPooledLikelihoods(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = stratifiedReadMap.keySet();
|
||||
|
|
@ -256,14 +258,14 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
}
|
||||
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
|
||||
if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
|
||||
if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
|
||||
haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
|
||||
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
|
||||
final List<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
|
||||
for( final int hIndex : bestHaplotypesIndexList ) {
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.Haplotype;
|
|||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
|
@ -59,13 +60,46 @@ import java.util.List;
|
|||
* Date: Mar 14, 2011
|
||||
*/
|
||||
public abstract class LocalAssemblyEngine {
|
||||
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16;
|
||||
|
||||
public enum ASSEMBLER {
|
||||
SIMPLE_DE_BRUIJN
|
||||
protected PrintStream graphWriter = null;
|
||||
protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE;
|
||||
protected int pruneFactor = 2;
|
||||
protected boolean errorCorrectKmers = false;
|
||||
|
||||
protected LocalAssemblyEngine() { }
|
||||
|
||||
public int getPruneFactor() {
|
||||
return pruneFactor;
|
||||
}
|
||||
|
||||
protected LocalAssemblyEngine() {
|
||||
public void setPruneFactor(int pruneFactor) {
|
||||
this.pruneFactor = pruneFactor;
|
||||
}
|
||||
|
||||
public abstract List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List<VariantContext> activeAllelesToGenotype);
|
||||
public boolean shouldErrorCorrectKmers() {
|
||||
return errorCorrectKmers;
|
||||
}
|
||||
|
||||
public void setErrorCorrectKmers(boolean errorCorrectKmers) {
|
||||
this.errorCorrectKmers = errorCorrectKmers;
|
||||
}
|
||||
|
||||
public PrintStream getGraphWriter() {
|
||||
return graphWriter;
|
||||
}
|
||||
|
||||
public void setGraphWriter(PrintStream graphWriter) {
|
||||
this.graphWriter = graphWriter;
|
||||
}
|
||||
|
||||
public byte getMinBaseQualityToUseInAssembly() {
|
||||
return minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) {
|
||||
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
public abstract List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List<VariantContext> activeAllelesToGenotype);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,9 +44,10 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
|
|
@ -68,7 +69,7 @@ public class BaseEdge {
|
|||
* @param multiplicity the number of observations of this edge
|
||||
*/
|
||||
public BaseEdge(final boolean isRef, final int multiplicity) {
|
||||
if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0");
|
||||
if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0 but got " + multiplicity);
|
||||
|
||||
this.multiplicity = multiplicity;
|
||||
this.isRef = isRef;
|
||||
|
|
@ -151,10 +152,39 @@ public class BaseEdge {
|
|||
* multiplicity is the sum
|
||||
*
|
||||
* @param edge the edge to add
|
||||
* @return this
|
||||
*/
|
||||
public void add(final BaseEdge edge) {
|
||||
public BaseEdge add(final BaseEdge edge) {
|
||||
if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
|
||||
this.multiplicity += edge.getMultiplicity();
|
||||
this.isRef = this.isRef || edge.isRef();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new BaseEdge with multiplicity and isRef that's an or of all edges
|
||||
*
|
||||
* @param edges a collection of edges to or their isRef values
|
||||
* @param multiplicity our desired multiplicity
|
||||
* @return a newly allocated BaseEdge
|
||||
*/
|
||||
public static BaseEdge orRef(final Collection<BaseEdge> edges, final int multiplicity) {
|
||||
for ( final BaseEdge e : edges )
|
||||
if ( e.isRef() )
|
||||
return new BaseEdge(true, multiplicity);
|
||||
return new BaseEdge(false, multiplicity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new edge whose multiplicity is the max of this and edge, and isRef is or of this and edge
|
||||
*
|
||||
* isRef is simply the or of this and edge
|
||||
* multiplicity is the max
|
||||
*
|
||||
* @param edge the edge to max
|
||||
*/
|
||||
public BaseEdge max(final BaseEdge edge) {
|
||||
if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
|
||||
return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity()));
|
||||
}
|
||||
}
|
||||
|
|
@ -44,10 +44,11 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.jgrapht.EdgeFactory;
|
||||
|
|
@ -310,15 +311,33 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
addVertex(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple vertices to the graph at once
|
||||
* @param vertices one or more vertices to add
|
||||
*/
|
||||
public void addVertices(final Collection<T> vertices) {
|
||||
for ( final T v : vertices )
|
||||
addVertex(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple edges to the graph
|
||||
* @param start the first vertex to connect
|
||||
* @param remaining all additional vertices to connect
|
||||
*/
|
||||
public void addEdges(final T start, final T ... remaining) {
|
||||
addEdges(new BaseEdge(false, 1), start, remaining);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple edges to the graph
|
||||
* @param start the first vertex to connect
|
||||
* @param remaining all additional vertices to connect
|
||||
*/
|
||||
public void addEdges(final BaseEdge template, final T start, final T ... remaining) {
|
||||
T prev = start;
|
||||
for ( final T next : remaining ) {
|
||||
addEdge(prev, next);
|
||||
addEdge(prev, next, new BaseEdge(template));
|
||||
prev = next;
|
||||
}
|
||||
}
|
||||
|
|
@ -366,19 +385,15 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
}
|
||||
}
|
||||
|
||||
// TODO -- generalize to support both types of graphs. Need some kind of display string function
|
||||
public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) {
|
||||
if ( writeHeader )
|
||||
graphWriter.println("digraph assemblyGraphs {");
|
||||
|
||||
for( final BaseEdge edge : edgeSet() ) {
|
||||
// if( edge.getMultiplicity() > PRUNE_FACTOR ) {
|
||||
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];");
|
||||
// }
|
||||
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];");
|
||||
if( edge.isRef() ) {
|
||||
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];");
|
||||
}
|
||||
//if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
|
||||
}
|
||||
|
||||
for( final T v : vertexSet() ) {
|
||||
|
|
@ -389,7 +404,12 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
graphWriter.println("}");
|
||||
}
|
||||
|
||||
protected void cleanNonRefPaths() {
|
||||
/**
|
||||
* Remove edges that are connected before the reference source and after the reference sink
|
||||
*
|
||||
* Also removes all vertices that are orphaned by this process
|
||||
*/
|
||||
public void cleanNonRefPaths() {
|
||||
if( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -416,6 +436,30 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
edgesToCheck.remove(e);
|
||||
}
|
||||
|
||||
removeSingletonOrphanVertices();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prune all edges from this graph that have multiplicity <= pruneFactor and remove all orphaned singleton vertices as well
|
||||
*
|
||||
* @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
|
||||
*/
|
||||
public void pruneGraph( final int pruneFactor ) {
|
||||
final List<BaseEdge> edgesToRemove = new ArrayList<BaseEdge>();
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
|
||||
edgesToRemove.add(e);
|
||||
}
|
||||
}
|
||||
removeAllEdges(edgesToRemove);
|
||||
|
||||
removeSingletonOrphanVertices();
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all vertices in the graph that have in and out degree of 0
|
||||
*/
|
||||
protected void removeSingletonOrphanVertices() {
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final List<T> verticesToRemove = new LinkedList<T>();
|
||||
for( final T v : vertexSet() ) {
|
||||
|
|
@ -426,46 +470,52 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
protected void pruneGraph( final int pruneFactor ) {
|
||||
final List<BaseEdge> edgesToRemove = new ArrayList<BaseEdge>();
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
|
||||
edgesToRemove.add(e);
|
||||
}
|
||||
}
|
||||
removeAllEdges(edgesToRemove);
|
||||
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final List<T> verticesToRemove = new ArrayList<T>();
|
||||
for( final T v : vertexSet() ) {
|
||||
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
|
||||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
public void removeVerticesNotConnectedToRef() {
|
||||
/**
|
||||
* Remove all vertices on the graph that cannot be accessed by following any edge,
|
||||
* regardless of its direction, from the reference source vertex
|
||||
*/
|
||||
public void removeVerticesNotConnectedToRefRegardlessOfEdgeDirection() {
|
||||
final HashSet<T> toRemove = new HashSet<T>(vertexSet());
|
||||
final HashSet<T> visited = new HashSet<T>();
|
||||
|
||||
final LinkedList<T> toVisit = new LinkedList<T>();
|
||||
final T refV = getReferenceSourceVertex();
|
||||
if ( refV != null ) {
|
||||
toVisit.add(refV);
|
||||
while ( ! toVisit.isEmpty() ) {
|
||||
final T v = toVisit.pop();
|
||||
if ( ! visited.contains(v) ) {
|
||||
toRemove.remove(v);
|
||||
visited.add(v);
|
||||
for ( final T prev : incomingVerticesOf(v) ) toVisit.add(prev);
|
||||
for ( final T next : outgoingVerticesOf(v) ) toVisit.add(next);
|
||||
}
|
||||
for ( final T v : new BaseGraphIterator<T>(this, refV, true, true) ) {
|
||||
toRemove.remove(v);
|
||||
}
|
||||
|
||||
removeAllVertices(toRemove);
|
||||
}
|
||||
|
||||
removeAllVertices(toRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex
|
||||
*
|
||||
* More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection,
|
||||
* as it requires vertices to not only be connected by a series of directed edges but also prunes away
|
||||
* paths that do not also meet eventually with the reference sink vertex
|
||||
*/
|
||||
public void removePathsNotConnectedToRef() {
|
||||
if ( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) {
|
||||
throw new IllegalStateException("Graph must have ref source and sink vertices");
|
||||
}
|
||||
|
||||
// get the set of vertices we can reach by going forward from the ref source
|
||||
final Set<T> onPathFromRefSource = new HashSet<T>(vertexSet().size());
|
||||
for ( final T v : new BaseGraphIterator<T>(this, getReferenceSourceVertex(), false, true) ) {
|
||||
onPathFromRefSource.add(v);
|
||||
}
|
||||
|
||||
// get the set of vertices we can reach by going backward from the ref sink
|
||||
final Set<T> onPathFromRefSink = new HashSet<T>(vertexSet().size());
|
||||
for ( final T v : new BaseGraphIterator<T>(this, getReferenceSinkVertex(), true, false) ) {
|
||||
onPathFromRefSink.add(v);
|
||||
}
|
||||
|
||||
// we want to remove anything that's not in both the sink and source sets
|
||||
final Set<T> verticesToRemove = new HashSet<T>(vertexSet());
|
||||
onPathFromRefSource.retainAll(onPathFromRefSink);
|
||||
verticesToRemove.removeAll(onPathFromRefSource);
|
||||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -515,17 +565,48 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the edge between source and target, or null if none is present
|
||||
*
|
||||
* Note that since we don't allow multiple edges between vertices there can be at most
|
||||
* one edge between any two edges
|
||||
*
|
||||
* @param source the source vertex for our edge
|
||||
* @param target the target vertex for our edge
|
||||
* @return the edge joining source to target, or null if none is present
|
||||
* Get the incoming edge of v. Requires that there be only one such edge or throws an error
|
||||
* @param v our vertex
|
||||
* @return the single incoming edge to v, or null if none exists
|
||||
*/
|
||||
public BaseEdge getEdge(final T source, final T target) {
|
||||
final Set<BaseEdge> edges = getAllEdges(source, target);
|
||||
public BaseEdge incomingEdgeOf(final T v) {
|
||||
return getSingletonEdge(incomingEdgesOf(v));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the outgoing edge of v. Requires that there be only one such edge or throws an error
|
||||
* @param v our vertex
|
||||
* @return the single outgoing edge from v, or null if none exists
|
||||
*/
|
||||
public BaseEdge outgoingEdgeOf(final T v) {
|
||||
return getSingletonEdge(outgoingEdgesOf(v));
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that gets the a single edge from edges, null if edges is empty, or
|
||||
* throws an error is edges has more than 1 element
|
||||
* @param edges a set of edges
|
||||
* @return a edge
|
||||
*/
|
||||
@Requires("edges != null")
|
||||
private BaseEdge getSingletonEdge(final Collection<BaseEdge> edges) {
|
||||
if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges);
|
||||
return edges.isEmpty() ? null : edges.iterator().next();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add edge between source -> target if none exists, or add e to an already existing one if present
|
||||
*
|
||||
* @param source source vertex
|
||||
* @param target vertex
|
||||
* @param e edge to add
|
||||
*/
|
||||
public void addOrUpdateEdge(final T source, final T target, final BaseEdge e) {
|
||||
final BaseEdge prev = getEdge(source, target);
|
||||
if ( prev != null ) {
|
||||
prev.add(e);
|
||||
} else {
|
||||
addEdge(source, target, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* General iterator that can iterate over all vertices in a BaseGraph, following either
|
||||
* incoming, outgoing edge (as well as both or none) edges. Supports traversal of graphs
|
||||
* with cycles and other crazy structures. Will only ever visit each vertex once. The
|
||||
* order in which the vertices are visited is undefined.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/24/13
|
||||
* Time: 4:41 PM
|
||||
*/
|
||||
public class BaseGraphIterator<T extends BaseVertex> implements Iterator<T>, Iterable<T> {
|
||||
final HashSet<T> visited = new HashSet<T>();
|
||||
final LinkedList<T> toVisit = new LinkedList<T>();
|
||||
final BaseGraph<T> graph;
|
||||
final boolean followIncomingEdges, followOutgoingEdges;
|
||||
|
||||
/**
|
||||
* Create a new BaseGraphIterator starting its traversal at start
|
||||
*
|
||||
* Note that if both followIncomingEdges and followOutgoingEdges are false, we simply return the
|
||||
* start vertex
|
||||
*
|
||||
* @param graph the graph to iterator over. Cannot be null
|
||||
* @param start the vertex to start at. Cannot be null
|
||||
* @param followIncomingEdges should we follow incoming edges during our
|
||||
* traversal? (goes backward through the graph)
|
||||
* @param followOutgoingEdges should we follow outgoing edges during out traversal?
|
||||
*/
|
||||
public BaseGraphIterator(final BaseGraph<T> graph, final T start,
|
||||
final boolean followIncomingEdges, final boolean followOutgoingEdges) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( start == null ) throw new IllegalArgumentException("start cannot be null");
|
||||
if ( ! graph.containsVertex(start) ) throw new IllegalArgumentException("start " + start + " must be in graph but it isn't");
|
||||
this.graph = graph;
|
||||
this.followIncomingEdges = followIncomingEdges;
|
||||
this.followOutgoingEdges = followOutgoingEdges;
|
||||
|
||||
toVisit.add(start);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<T> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return ! toVisit.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
final T v = toVisit.pop();
|
||||
|
||||
if ( ! visited.contains(v) ) {
|
||||
visited.add(v);
|
||||
if ( followIncomingEdges ) for ( final T prev : graph.incomingVerticesOf(v) ) toVisit.add(prev);
|
||||
if ( followOutgoingEdges ) for ( final T next : graph.outgoingVerticesOf(v) ) toVisit.add(next);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Doesn't implement remove");
|
||||
}
|
||||
}
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
|
||||
|
|
@ -69,10 +69,21 @@ public class BaseVertex {
|
|||
*/
|
||||
public BaseVertex(final byte[] sequence) {
|
||||
if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
|
||||
if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty");
|
||||
this.sequence = sequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this vertex have an empty sequence?
|
||||
*
|
||||
* That is, is it a dummy node that's only present for structural reasons but doesn't actually
|
||||
* contribute to the sequence of the graph?
|
||||
*
|
||||
* @return true if sequence is empty, false otherwise
|
||||
*/
|
||||
public boolean isEmpty() {
|
||||
return length() == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the length of this sequence
|
||||
* @return a positive integer >= 1
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Split a collection of middle nodes in a graph into their shared prefix and suffix values
|
||||
*
|
||||
* This code performs the following transformation. Suppose I have a set of vertices V, such
|
||||
* that each vertex is composed of sequence such that
|
||||
*
|
||||
* Vi = prefix + seq_i + suffix
|
||||
*
|
||||
* where prefix and suffix are shared sequences across all vertices V. This replaces each
|
||||
* Vi with three nodes prefix, seq_i, and suffix connected in a simple chain.
|
||||
*
|
||||
* This operation can be performed in a very general case, without too much worry about the incoming
|
||||
* and outgoing edge structure of each Vi. The partner algorithm SharedSequenceMerger can
|
||||
* put these pieces back together in a smart way that maximizes the sharing of nodes
|
||||
* while respecting complex connectivity.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/22/13
|
||||
* Time: 8:31 AM
|
||||
*/
|
||||
public class CommonSuffixSplitter {
|
||||
/**
|
||||
* Create a new graph that contains the vertices in toMerge with their shared suffix and prefix
|
||||
* sequences extracted out.
|
||||
*
|
||||
*/
|
||||
public CommonSuffixSplitter() {}
|
||||
|
||||
/**
|
||||
* Simple single-function interface to split and then update a graph
|
||||
*
|
||||
* @param graph the graph containing the vertices in toMerge
|
||||
* @param v The bottom node whose incoming vertices we'd like to split
|
||||
* @return true if some useful splitting was done, false otherwise
|
||||
*/
|
||||
public boolean split(final SeqGraph graph, final SeqVertex v) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( v == null ) throw new IllegalArgumentException("v cannot be null");
|
||||
if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex v " + v);
|
||||
|
||||
final Collection<SeqVertex> toMerge = graph.incomingVerticesOf(v);
|
||||
if ( toMerge.size() < 2 )
|
||||
// Can only split at least 2 vertices
|
||||
return false;
|
||||
else if ( ! safeToSplit(graph, v, toMerge) ) {
|
||||
return false;
|
||||
} else {
|
||||
final SeqVertex suffixVTemplate = commonSuffix(toMerge);
|
||||
if ( suffixVTemplate.isEmpty() ) {
|
||||
return false;
|
||||
} else {
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
|
||||
|
||||
// graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0);
|
||||
for ( final SeqVertex mid : toMerge ) {
|
||||
// create my own copy of the suffix
|
||||
final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence());
|
||||
graph.addVertex(suffixV);
|
||||
final SeqVertex prefixV = mid.withoutSuffix(suffixV.getSequence());
|
||||
final BaseEdge out = graph.outgoingEdgeOf(mid);
|
||||
|
||||
final SeqVertex incomingTarget;
|
||||
if ( prefixV == null ) {
|
||||
// this node is entirely explained by suffix
|
||||
incomingTarget = suffixV;
|
||||
} else {
|
||||
incomingTarget = prefixV;
|
||||
graph.addVertex(prefixV);
|
||||
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0));
|
||||
edgesToRemove.add(out);
|
||||
}
|
||||
|
||||
graph.addEdge(suffixV, graph.getEdgeTarget(out), new BaseEdge(out));
|
||||
|
||||
for ( final BaseEdge in : graph.incomingEdgesOf(mid) ) {
|
||||
graph.addEdge(graph.getEdgeSource(in), incomingTarget, new BaseEdge(in));
|
||||
edgesToRemove.add(in);
|
||||
}
|
||||
}
|
||||
|
||||
graph.removeAllVertices(toMerge);
|
||||
graph.removeAllEdges(edgesToRemove);
|
||||
// graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// private static int counter = 0;
|
||||
|
||||
/**
|
||||
* Can we safely split up the vertices in toMerge?
|
||||
*
|
||||
* @param graph a graph
|
||||
* @param bot a vertex whose incoming vertices we want to split
|
||||
* @param toMerge the set of vertices we'd be splitting up
|
||||
* @return true if we can safely split up toMerge
|
||||
*/
|
||||
private boolean safeToSplit(final SeqGraph graph, final SeqVertex bot, final Collection<SeqVertex> toMerge) {
|
||||
for ( final SeqVertex m : toMerge ) {
|
||||
final Set<BaseEdge> outs = graph.outgoingEdgesOf(m);
|
||||
if ( m == bot || outs.size() != 1 || ! graph.outgoingVerticesOf(m).contains(bot) )
|
||||
// m == bot => don't allow cycles in the graph
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the longest suffix of bases shared among all provided vertices
|
||||
*
|
||||
* For example, if the vertices have sequences AC, CC, and ATC, this would return
|
||||
* a single C. However, for ACC and TCC this would return CC. And for AC and TG this
|
||||
* would return null;
|
||||
*
|
||||
* @param middleVertices a non-empty set of vertices
|
||||
* @return a single vertex that contains the common suffix of all middle vertices
|
||||
*/
|
||||
@Requires("!middleVertices.isEmpty()")
|
||||
protected static SeqVertex commonSuffix(final Collection<SeqVertex> middleVertices) {
|
||||
final List<byte[]> kmers = Utils.getKmers(middleVertices);
|
||||
final int min = Utils.minKmerLength(kmers);
|
||||
final int suffixLen = Utils.compSuffixLen(kmers, min);
|
||||
final byte[] kmer = kmers.get(0);
|
||||
final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length);
|
||||
return new SeqVertex(suffix);
|
||||
}
|
||||
}
|
||||
|
|
@ -44,9 +44,10 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerErrorCorrector;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
|
@ -88,68 +89,42 @@ public class DeBruijnGraph extends BaseGraph<DeBruijnVertex> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Error correct the kmers in this graph, returning a new graph built from those error corrected kmers
|
||||
* @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason
|
||||
* we cannot actually do the error correction
|
||||
*/
|
||||
protected DeBruijnGraph errorCorrect() {
|
||||
final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables
|
||||
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) {
|
||||
// TODO -- need a cleaner way to deal with the ref weight
|
||||
corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity());
|
||||
}
|
||||
}
|
||||
|
||||
if ( corrector.computeErrorCorrectionMap() ) {
|
||||
final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize());
|
||||
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence());
|
||||
final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence());
|
||||
if ( source != null && target != null ) {
|
||||
correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity());
|
||||
}
|
||||
}
|
||||
|
||||
return correctedGraph;
|
||||
} else {
|
||||
// the error correction wasn't possible, simply return this graph
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add edge to assembly graph connecting the two kmers
|
||||
* @param kmer1 the source kmer for the edge
|
||||
* @param kmer2 the target kmer for the edge
|
||||
* @param isRef true if the added edge is a reference edge
|
||||
* @return will return false if trying to add a reference edge which creates a cycle in the assembly graph
|
||||
*/
|
||||
public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) {
|
||||
public void addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) {
|
||||
if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
|
||||
if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
|
||||
if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); }
|
||||
|
||||
final int numVertexBefore = vertexSet().size();
|
||||
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 );
|
||||
addVertex(v1);
|
||||
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 );
|
||||
addVertex(v2);
|
||||
if( isRef && vertexSet().size() == numVertexBefore ) { return false; }
|
||||
final BaseEdge toAdd = new BaseEdge(isRef, multiplicity);
|
||||
|
||||
final BaseEdge targetEdge = getEdge(v1, v2);
|
||||
if ( targetEdge == null ) {
|
||||
addEdge(v1, v2, new BaseEdge( isRef, multiplicity ));
|
||||
} else {
|
||||
if( isRef ) {
|
||||
targetEdge.setIsRef( true );
|
||||
}
|
||||
targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity);
|
||||
}
|
||||
return true;
|
||||
addVertices(v1, v2);
|
||||
addOrUpdateEdge(v1, v2, toAdd);
|
||||
}
|
||||
|
||||
/**
|
||||
* Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this
|
||||
* graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The
|
||||
* edge between them is added with isRef and multiplicity
|
||||
*
|
||||
* @param sequence a sequence of bases from which we want to extract a pair of kmers
|
||||
* @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize()
|
||||
* @param isRef should the edge between the two kmers be a reference edge?
|
||||
* @param multiplicity what's the multiplicity of the edge between these two kmers
|
||||
*/
|
||||
public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final boolean isRef, final int multiplicity ) {
|
||||
if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
|
||||
if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start);
|
||||
if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length);
|
||||
final byte[] kmer1 = Arrays.copyOfRange(sequence, start, start + getKmerSize());
|
||||
final byte[] kmer2 = Arrays.copyOfRange(sequence, start + 1, start + 1 + getKmerSize());
|
||||
addKmersToGraph(kmer1, kmer2, isRef, multiplicity);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -161,7 +136,7 @@ public class DeBruijnGraph extends BaseGraph<DeBruijnVertex> {
|
|||
* @return a newly allocated SequenceGraph
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
protected SeqGraph convertToSequenceGraph() {
|
||||
public SeqGraph convertToSequenceGraph() {
|
||||
final SeqGraph seqGraph = new SeqGraph(getKmerSize());
|
||||
final Map<DeBruijnVertex, SeqVertex> vertexMap = new HashMap<DeBruijnVertex, SeqVertex>();
|
||||
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.common.collect.MinMaxPriorityQueue;
|
||||
import com.google.java.contract.Ensures;
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
|
@ -67,7 +67,9 @@ import java.util.*;
|
|||
* Time: 2:34 PM
|
||||
*
|
||||
*/
|
||||
class Path<T extends BaseVertex> {
|
||||
public class Path<T extends BaseVertex> {
|
||||
private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20;
|
||||
|
||||
// the last vertex seen in the path
|
||||
private final T lastVertex;
|
||||
|
||||
|
|
@ -161,8 +163,9 @@ class Path<T extends BaseVertex> {
|
|||
boolean first = true;
|
||||
for ( final T v : getVertices() ) {
|
||||
if ( first ) {
|
||||
b.append(" -> ");
|
||||
first = false;
|
||||
} else {
|
||||
b.append(" -> ");
|
||||
}
|
||||
b.append(v.getSequenceString());
|
||||
}
|
||||
|
|
@ -357,7 +360,7 @@ class Path<T extends BaseVertex> {
|
|||
}
|
||||
|
||||
final Cigar swCigar = swConsensus.getCigar();
|
||||
if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference
|
||||
if( swCigar.numCigarElements() > MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW ) { // this bubble is too divergent from the reference
|
||||
returnCigar.add(new CigarElement(1, CigarOperator.N));
|
||||
} else {
|
||||
for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
|
||||
|
|
@ -391,4 +394,30 @@ class Path<T extends BaseVertex> {
|
|||
cigar = initialCigar;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that this and other have the same score and vertices in the same order with the same seq
|
||||
* @param other the other path to consider. Cannot be null
|
||||
* @return true if this and path are equal, false otherwise
|
||||
*/
|
||||
public boolean equalScoreAndSequence(final Path<T> other) {
|
||||
if ( other == null ) throw new IllegalArgumentException("other cannot be null");
|
||||
return getScore() == other.getScore() && equalSequence(other);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that this and other have the same vertices in the same order with the same seq
|
||||
* @param other the other path to consider. Cannot be null
|
||||
* @return true if this and path are equal, false otherwise
|
||||
*/
|
||||
public boolean equalSequence(final Path<T> other) {
|
||||
final List<T> mine = getVertices();
|
||||
final List<T> yours = other.getVertices();
|
||||
if ( mine.size() == yours.size() ) { // hehehe
|
||||
for ( int i = 0; i < mine.size(); i++ )
|
||||
if ( ! mine.get(i).seqEquals(yours.get(i)) )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -44,15 +44,13 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A graph that contains base sequence at each node
|
||||
|
|
@ -61,6 +59,9 @@ import java.util.*;
|
|||
* @since 03/2013
|
||||
*/
|
||||
public class SeqGraph extends BaseGraph<SeqVertex> {
|
||||
private final static boolean PRINT_SIMPLIFY_GRAPHS = false;
|
||||
private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5;
|
||||
|
||||
/**
|
||||
* Construct an empty SeqGraph
|
||||
*/
|
||||
|
|
@ -86,18 +87,45 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
* in any way the sequences implied by a complex enumeration of all paths through the graph.
|
||||
*/
|
||||
public void simplifyGraph() {
|
||||
simplifyGraph(Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
protected void simplifyGraph(final int maxCycles) {
|
||||
boolean didSomeWork;
|
||||
int i = 0;
|
||||
|
||||
// start off with one round of zipping of chains for performance reasons
|
||||
zipLinearChains();
|
||||
mergeBranchingNodes();
|
||||
zipLinearChains();
|
||||
do {
|
||||
//logger.info("simplifyGraph iteration " + i);
|
||||
// iterate until we haven't don't anything useful
|
||||
didSomeWork = false;
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".dot"), 0);
|
||||
didSomeWork |= new MergeDiamonds().transformUntilComplete();
|
||||
didSomeWork |= new MergeTails().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".diamonds_and_tails.dot"), 0);
|
||||
|
||||
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".split_suffix.dot"), 0);
|
||||
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".merge_suffix.dot"), 0);
|
||||
|
||||
didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete();
|
||||
didSomeWork |= zipLinearChains();
|
||||
i++;
|
||||
} while (didSomeWork && i < maxCycles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Zip up all of the simple linear chains present in this graph.
|
||||
*/
|
||||
protected void zipLinearChains() {
|
||||
public boolean zipLinearChains() {
|
||||
boolean foundOne = false;
|
||||
while( zipOneLinearChain() ) {
|
||||
// just keep going until zipOneLinearChain says its done
|
||||
foundOne = true;
|
||||
}
|
||||
return foundOne;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -119,13 +147,16 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
|
||||
final Set<BaseEdge> outEdges = outgoingEdgesOf(outgoingVertex);
|
||||
final Set<BaseEdge> inEdges = incomingEdgesOf(incomingVertex);
|
||||
final BaseEdge singleOutEdge = outEdges.isEmpty() ? null : outEdges.iterator().next();
|
||||
final BaseEdge singleInEdge = inEdges.isEmpty() ? null : inEdges.iterator().next();
|
||||
|
||||
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
singleInEdge.setMultiplicity( singleInEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
singleOutEdge.setMultiplicity( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
} else if( inEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
singleInEdge.setMultiplicity( Math.max(singleInEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) );
|
||||
} else if( outEdges.size() == 1 ) {
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
singleOutEdge.setMultiplicity( Math.max( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) );
|
||||
}
|
||||
|
||||
final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) );
|
||||
|
|
@ -147,193 +178,219 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Perform as many branch simplifications and merging operations as possible on this graph,
|
||||
* modifying it in place.
|
||||
* Base class for transformation operations that need to iterate over proposed vertices, where
|
||||
* each proposed vertex is a seed vertex for a potential transformation.
|
||||
*
|
||||
* transformUntilComplete will iteratively apply the tryToTransform function on each vertex in the graph
|
||||
* until no vertex can be found that can be transformed.
|
||||
*
|
||||
* Note that in order to eventually terminate tryToTransform must transform the graph such that eventually
|
||||
* no vertices are candidates for further transformations.
|
||||
*/
|
||||
protected void mergeBranchingNodes() {
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
private abstract class VertexBasedTransformer {
|
||||
/**
|
||||
* For testing purposes we sometimes want to test that can be transformed capabilities are working
|
||||
* without actually modifying the graph */
|
||||
private boolean dontModifyGraphEvenIfPossible = false;
|
||||
|
||||
for( final SeqVertex v : vertexSet() ) {
|
||||
foundNodesToMerge = simplifyDiamondIfPossible(v);
|
||||
if ( foundNodesToMerge )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
public boolean dontModifyGraphEvenIfPossible() { return dontModifyGraphEvenIfPossible; }
|
||||
public void setDontModifyGraphEvenIfPossible() { this.dontModifyGraphEvenIfPossible = true; }
|
||||
|
||||
/**
|
||||
* A simple structure that looks like:
|
||||
*
|
||||
* v
|
||||
* / | \ \
|
||||
* m1 m2 m3 ... mn
|
||||
* \ | / /
|
||||
* b
|
||||
*
|
||||
* Only returns true if all outgoing edges of v go to vertices that all only connect to
|
||||
* a single bottom node, and that all middle nodes have only the single edge
|
||||
*
|
||||
* @param v the vertex to test if its the top of a diamond pattern
|
||||
* @return true if v is the root of a diamond
|
||||
*/
|
||||
protected boolean isRootOfDiamond(final SeqVertex v) {
|
||||
final Set<BaseEdge> ve = outgoingEdgesOf(v);
|
||||
if ( ve.size() <= 1 )
|
||||
return false;
|
||||
/**
|
||||
* Merge until the graph has no vertices that are candidates for merging
|
||||
*/
|
||||
public boolean transformUntilComplete() {
|
||||
boolean didAtLeastOneTranform = false;
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
|
||||
SeqVertex bottom = null;
|
||||
for ( final BaseEdge e : ve ) {
|
||||
final SeqVertex mi = getEdgeTarget(e);
|
||||
|
||||
// all nodes must have at least 1 connection
|
||||
if ( outDegreeOf(mi) < 1 )
|
||||
return false;
|
||||
|
||||
// can only have 1 incoming node, the root vertex
|
||||
if ( inDegreeOf(mi) != 1 )
|
||||
return false;
|
||||
|
||||
// make sure that all outgoing vertices of mi go only to the bottom node
|
||||
for ( final SeqVertex mt : outgoingVerticesOf(mi) ) {
|
||||
if ( bottom == null )
|
||||
bottom = mt;
|
||||
else if ( ! bottom.equals(mt) )
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// bottom has some connections coming in from other nodes, don't allow
|
||||
if ( inDegreeOf(bottom) != ve.size() )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the longest suffix of bases shared among all provided vertices
|
||||
*
|
||||
* For example, if the vertices have sequences AC, CC, and ATC, this would return
|
||||
* a single C. However, for ACC and TCC this would return CC. And for AC and TG this
|
||||
* would return null;
|
||||
*
|
||||
* @param middleVertices a non-empty set of vertices
|
||||
* @return
|
||||
*/
|
||||
@Requires("!middleVertices.isEmpty()")
|
||||
private byte[] commonSuffixOfEdgeTargets(final Set<SeqVertex> middleVertices) {
|
||||
final String[] kmers = new String[middleVertices.size()];
|
||||
|
||||
int i = 0;
|
||||
for ( final SeqVertex v : middleVertices ) {
|
||||
kmers[i++] = (StringUtils.reverse(v.getSequenceString()));
|
||||
}
|
||||
|
||||
final String commonPrefix = StringUtils.getCommonPrefix(kmers);
|
||||
return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the node that is the bottom of a diamond configuration in the graph starting at top
|
||||
*
|
||||
* @param top
|
||||
* @return
|
||||
*/
|
||||
@Requires("top != null")
|
||||
@Ensures({"result != null"})
|
||||
private SeqVertex getDiamondBottom(final SeqVertex top) {
|
||||
final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next();
|
||||
final SeqVertex middle = getEdgeTarget(topEdge);
|
||||
final BaseEdge middleEdge = outgoingEdgesOf(middle).iterator().next();
|
||||
return getEdgeTarget(middleEdge);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices that are in the middle of a diamond starting at top
|
||||
* @param top
|
||||
* @return
|
||||
*/
|
||||
@Requires("top != null")
|
||||
@Ensures({"result != null", "!result.isEmpty()"})
|
||||
final Set<SeqVertex> getMiddleVertices(final SeqVertex top) {
|
||||
final Set<SeqVertex> middles = new HashSet<SeqVertex>();
|
||||
for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) {
|
||||
middles.add(getEdgeTarget(topToMiddle));
|
||||
}
|
||||
return middles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simply a diamond configuration in the current graph starting at top, if possible
|
||||
*
|
||||
* If top is actually the top of a diamond that can be simplified (i.e., doesn't have any
|
||||
* random edges or other structure that would cause problems with the transformation), then this code
|
||||
* performs the following transformation on this graph (modifying it):
|
||||
*
|
||||
* A -> M1 -> B, A -> M2 -> B, A -> Mn -> B
|
||||
*
|
||||
* becomes
|
||||
*
|
||||
* A -> M1' -> B', A -> M2' -> B', A -> Mn' -> B'
|
||||
*
|
||||
* where B' is composed of the longest common suffix of all Mi nodes + B, and Mi' are each
|
||||
* middle vertex without their shared suffix.
|
||||
*
|
||||
* @param top a proposed vertex in this graph that might start a diamond (but doesn't have to)
|
||||
* @return true top actually starts a diamond and it could be simplified
|
||||
*/
|
||||
private boolean simplifyDiamondIfPossible(final SeqVertex top) {
|
||||
if ( ! isRootOfDiamond(top) )
|
||||
return false;
|
||||
|
||||
final SeqVertex diamondBottom = getDiamondBottom(top);
|
||||
final Set<SeqVertex> middleVertices = getMiddleVertices(top);
|
||||
final List<SeqVertex> verticesToRemove = new LinkedList<SeqVertex>();
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
|
||||
|
||||
// all of the edges point to the same sink, so it's time to merge
|
||||
final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices);
|
||||
if ( commonSuffix != null ) {
|
||||
final BaseEdge botToNewBottom = new BaseEdge(false, 0);
|
||||
final BaseEdge elimMiddleNodeEdge = new BaseEdge(false, 0);
|
||||
final SeqVertex newBottomV = new SeqVertex(commonSuffix);
|
||||
addVertex(newBottomV);
|
||||
|
||||
for ( final SeqVertex middle : middleVertices ) {
|
||||
final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix);
|
||||
final BaseEdge topToMiddleEdge = getEdge(top, middle);
|
||||
final BaseEdge middleToBottomE = getEdge(middle, diamondBottom);
|
||||
|
||||
// clip out the two edges, since we'll be replacing them later
|
||||
edgesToRemove.add(topToMiddleEdge);
|
||||
edgesToRemove.add(middleToBottomE);
|
||||
|
||||
if ( withoutSuffix != null ) { // this node is a deletion
|
||||
addVertex(withoutSuffix);
|
||||
// update edge from top -> middle to be top -> without suffix
|
||||
addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge));
|
||||
addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE));
|
||||
} else {
|
||||
// this middle node is == the common suffix, wo we're removing the edge
|
||||
elimMiddleNodeEdge.add(topToMiddleEdge);
|
||||
for( final SeqVertex v : vertexSet() ) {
|
||||
foundNodesToMerge = tryToTransform(v);
|
||||
if ( foundNodesToMerge ) {
|
||||
didAtLeastOneTranform = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// include the ref and multi of mid -> bot in our edge from new bot -> bot
|
||||
botToNewBottom.add(middleToBottomE);
|
||||
verticesToRemove.add(middle);
|
||||
}
|
||||
|
||||
// add an edge from top to new bottom, because some middle nodes were removed
|
||||
if ( elimMiddleNodeEdge.getMultiplicity() > 0 )
|
||||
addEdge(top, newBottomV, elimMiddleNodeEdge);
|
||||
return didAtLeastOneTranform;
|
||||
}
|
||||
|
||||
addEdge(newBottomV, diamondBottom, botToNewBottom);
|
||||
/**
|
||||
* Merge, if possible, seeded on the vertex v
|
||||
* @param v the proposed seed vertex to merge
|
||||
* @return true if some useful merging happened, false otherwise
|
||||
*/
|
||||
abstract boolean tryToTransform(final SeqVertex v);
|
||||
}
|
||||
|
||||
removeAllEdges(edgesToRemove);
|
||||
removeAllVertices(verticesToRemove);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
/**
|
||||
* Merge diamond configurations:
|
||||
*
|
||||
* Performance the transformation:
|
||||
*
|
||||
* { A -> x + S_i + y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { A -> x -> S_i -> y -> Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*/
|
||||
protected class MergeDiamonds extends VertexBasedTransformer {
|
||||
@Override
|
||||
protected boolean tryToTransform(final SeqVertex top) {
|
||||
final Set<SeqVertex> middles = outgoingVerticesOf(top);
|
||||
if ( middles.size() <= 1 )
|
||||
// we can only merge if there's at least two middle nodes
|
||||
return false;
|
||||
|
||||
SeqVertex bottom = null;
|
||||
for ( final SeqVertex mi : middles ) {
|
||||
// all nodes must have at least 1 connection
|
||||
if ( outDegreeOf(mi) < 1 )
|
||||
return false;
|
||||
|
||||
// can only have 1 incoming node, the root vertex
|
||||
if ( inDegreeOf(mi) != 1 )
|
||||
return false;
|
||||
|
||||
// make sure that all outgoing vertices of mi go only to the bottom node
|
||||
for ( final SeqVertex mt : outgoingVerticesOf(mi) ) {
|
||||
if ( bottom == null )
|
||||
bottom = mt;
|
||||
else if ( ! bottom.equals(mt) )
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// bottom has some connections coming in from other nodes, don't allow
|
||||
if ( inDegreeOf(bottom) != middles.size() )
|
||||
return false;
|
||||
|
||||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
// actually do the merging, returning true if at least 1 base was successfully split
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles);
|
||||
return splitter.splitAndUpdate(top, bottom, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge tail configurations:
|
||||
*
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { A -> x + S_i + y }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { A -> x -> S_i -> y }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*
|
||||
* Differs from the diamond transform in that no bottom node is required
|
||||
*/
|
||||
protected class MergeTails extends VertexBasedTransformer {
|
||||
@Override
|
||||
protected boolean tryToTransform(final SeqVertex top) {
|
||||
final Set<SeqVertex> tails = outgoingVerticesOf(top);
|
||||
if ( tails.size() <= 1 )
|
||||
return false;
|
||||
|
||||
for ( final SeqVertex t : tails )
|
||||
if ( ! isSink(t) || inDegreeOf(t) > 1 )
|
||||
return false;
|
||||
|
||||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails);
|
||||
return splitter.splitAndUpdate(top, null, MIN_SUFFIX_TO_MERGE_TAILS);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge headless configurations:
|
||||
*
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { x + S_i + y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { x -> S_i -> y -> Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*
|
||||
* Differs from the diamond transform in that no top node is required
|
||||
*/
|
||||
protected class MergeCommonSuffices extends VertexBasedTransformer {
|
||||
@Override
|
||||
boolean tryToTransform(final SeqVertex bottom) {
|
||||
return new SharedSequenceMerger().merge(SeqGraph.this, bottom);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge headless configurations:
|
||||
*
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { x + S_i + y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { x -> S_i -> y -> Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*
|
||||
* Differs from the diamond transform in that no top node is required
|
||||
*/
|
||||
protected class SplitCommonSuffices extends VertexBasedTransformer {
|
||||
final Set<SeqVertex> alreadySplit = new HashSet<SeqVertex>();
|
||||
|
||||
@Override
|
||||
boolean tryToTransform(final SeqVertex bottom) {
|
||||
if ( alreadySplit.contains(bottom) )
|
||||
return false;
|
||||
else {
|
||||
alreadySplit.add(bottom);
|
||||
return new CommonSuffixSplitter().split(SeqGraph.this, bottom);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge headless configurations:
|
||||
*
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { x + S_i + y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { x -> S_i -> y -> Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*
|
||||
* Differs from the diamond transform in that no top node is required
|
||||
*/
|
||||
protected class MergeHeadlessIncomingSources extends VertexBasedTransformer {
|
||||
@Override
|
||||
boolean tryToTransform(final SeqVertex bottom) {
|
||||
final Set<SeqVertex> incoming = incomingVerticesOf(bottom);
|
||||
if ( incoming.size() <= 1 )
|
||||
return false;
|
||||
|
||||
for ( final SeqVertex inc : incoming )
|
||||
if ( ! isSource(inc) || outDegreeOf(inc) > 1 )
|
||||
return false;
|
||||
|
||||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming);
|
||||
return splitter.splitAndUpdate(null, bottom, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -44,11 +44,10 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
|
|
@ -150,4 +149,19 @@ public class SeqVertex extends BaseVertex {
|
|||
final int prefixSize = sequence.length - suffix.length;
|
||||
return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new SeqVertex derived from this one but not including prefix or suffix bases
|
||||
*
|
||||
* @param prefix the previx bases to remove
|
||||
* @param suffix the suffix bases to remove from this vertex
|
||||
* @return a newly allocated SeqVertex
|
||||
*/
|
||||
@Requires("Utils.endsWith(sequence, suffix)")
|
||||
public SeqVertex withoutPrefixAndSuffix(final byte[] prefix, final byte[] suffix) {
|
||||
final int start = prefix.length;
|
||||
final int length = sequence.length - suffix.length - prefix.length;
|
||||
final int stop = start + length;
|
||||
return length > 0 ? new SeqVertex(Arrays.copyOfRange(sequence, start, stop)) : null;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Merges the incoming vertices of a vertex V of a graph
|
||||
*
|
||||
* Looks at the vertices that are incoming to V (i.e., have an outgoing edge connecting to V). If
|
||||
* they all have the same sequence, merges them into the sequence of V, and updates the graph
|
||||
* as appropriate
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/22/13
|
||||
* Time: 8:31 AM
|
||||
*/
|
||||
public class SharedSequenceMerger {
|
||||
public SharedSequenceMerger() { }
|
||||
|
||||
/**
|
||||
* Attempt to merge the incoming vertices of v
|
||||
*
|
||||
* @param graph the graph containing the vertex v
|
||||
* @param v the vertex whose incoming vertices we want to merge
|
||||
* @return true if some useful merging was done, false otherwise
|
||||
*/
|
||||
public boolean merge(final SeqGraph graph, final SeqVertex v) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v);
|
||||
|
||||
final Set<SeqVertex> prevs = graph.incomingVerticesOf(v);
|
||||
if ( ! canMerge(graph, v, prevs) )
|
||||
return false;
|
||||
else {
|
||||
// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0);
|
||||
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
|
||||
final byte[] prevSeq = prevs.iterator().next().getSequence();
|
||||
final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence()));
|
||||
graph.addVertex(newV);
|
||||
|
||||
for ( final SeqVertex prev : prevs ) {
|
||||
for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) {
|
||||
graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn));
|
||||
edgesToRemove.add(prevIn);
|
||||
}
|
||||
}
|
||||
|
||||
for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) {
|
||||
graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e));
|
||||
}
|
||||
|
||||
graph.removeAllVertices(prevs);
|
||||
graph.removeVertex(v);
|
||||
graph.removeAllEdges(edgesToRemove);
|
||||
|
||||
// graph.printGraph(new File("csm." + counter++ + "." + v.getSequenceString() + "_post.dot"), 0);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
//private static int counter = 0;
|
||||
|
||||
/**
|
||||
* Can we safely merge the incoming vertices of v
|
||||
*
|
||||
* @param graph the graph containing v and incomingVertices
|
||||
* @param v the vertex we want to merge into
|
||||
* @param incomingVertices the incoming vertices of v
|
||||
* @return true if we can safely merge incomingVertices
|
||||
*/
|
||||
private boolean canMerge(final SeqGraph graph, final SeqVertex v, final Collection<SeqVertex> incomingVertices) {
|
||||
if ( incomingVertices.isEmpty() )
|
||||
return false;
|
||||
|
||||
final SeqVertex first = incomingVertices.iterator().next();
|
||||
for ( final SeqVertex prev : incomingVertices) {
|
||||
if ( ! prev.seqEquals(first) )
|
||||
return false;
|
||||
final Collection<SeqVertex> prevOuts = graph.outgoingVerticesOf(prev);
|
||||
if ( prevOuts.size() != 1 )
|
||||
return false;
|
||||
if ( prevOuts.iterator().next() != v )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,300 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Split a collection of middle nodes in a graph into their shared prefix and suffix values
|
||||
*
|
||||
* This code performs the following transformation. Suppose I have a set of vertices V, such
|
||||
* that each vertex is composed of sequence such that
|
||||
*
|
||||
* Vi = prefix + seq_i + suffix
|
||||
*
|
||||
* where prefix and suffix are shared sequences across all vertices V
|
||||
*
|
||||
* This algorithm creates a new SeqGraph with the following configuration
|
||||
*
|
||||
* prefix -> has outgoing edges to all seq_i
|
||||
* suffix -> has incoming edges for all seq_i
|
||||
*
|
||||
* There are a few special cases that must be handled. First, Vi could be simply
|
||||
* == to the prefix or the suffix. These generate direct connections between
|
||||
* the prefix and suffix nodes, and they are handled internally by the algorithm.
|
||||
*
|
||||
* Note that for convenience, we will always create newTop and newBottom nodes, but
|
||||
* these may be empty node (i.e., they contain no sequence). That allows them to be
|
||||
* trivially merged, if desired, when the graph is incorporated into an overall
|
||||
* graph.
|
||||
*
|
||||
* The product of this operation is a SeqGraph that contains the split. There's a
|
||||
* function to merge reconnect this graph into the graph that contains the middle nodes
|
||||
*
|
||||
* The process guarentees a few things about the output:
|
||||
*
|
||||
* -- Preserves the paths and weights among all vertices
|
||||
*
|
||||
* It produces a graph that has some unusual properties
|
||||
*
|
||||
* -- May add nodes with no sequence (isEmpty() == true) to preserve connectivity among the graph
|
||||
* -- May introduce edges with no multiplicity to preserve paths through the graph
|
||||
*
|
||||
* The overall workflow of using this class is simple:
|
||||
*
|
||||
* find vertices V in graph that you want to split out
|
||||
* s = new SharedVertexSequenceSplitter(graph, V)
|
||||
* s.updateGraph(graph)
|
||||
*
|
||||
* to update the graph with the modifications created by this splitter
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/22/13
|
||||
* Time: 8:31 AM
|
||||
*/
|
||||
public class SharedVertexSequenceSplitter {
|
||||
final private SeqGraph outer;
|
||||
final protected SeqVertex prefixV, suffixV;
|
||||
final protected Collection<SeqVertex> toSplits;
|
||||
|
||||
// updated in split routine
|
||||
protected SeqGraph splitGraph = null;
|
||||
protected Collection<SeqVertex> newMiddles = null;
|
||||
protected List<BaseEdge> edgesToRemove = null;
|
||||
|
||||
/**
|
||||
* Create a new graph that contains the vertices in toSplitsArg with their shared suffix and prefix
|
||||
* sequences extracted out.
|
||||
*
|
||||
* @param graph the graph containing the vertices in toSplitsArg
|
||||
* @param toSplitsArg a collection of vertices to split. Must be contained within graph, and have only connections
|
||||
* from a single shared top and/or bottom node
|
||||
*/
|
||||
public SharedVertexSequenceSplitter(final SeqGraph graph, final Collection<SeqVertex> toSplitsArg) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( toSplitsArg == null ) throw new IllegalArgumentException("toSplitsArg cannot be null");
|
||||
if ( toSplitsArg.size() < 2 ) throw new IllegalArgumentException("Can only split at least 2 vertices but only got " + toSplitsArg);
|
||||
if ( ! graph.vertexSet().containsAll(toSplitsArg) ) throw new IllegalArgumentException("graph doesn't contain all of the vertices to split");
|
||||
|
||||
this.outer = graph;
|
||||
this.toSplits = toSplitsArg;
|
||||
|
||||
// all of the edges point to the same sink, so it's time to merge
|
||||
final Pair<SeqVertex, SeqVertex> prefixAndSuffix = commonPrefixAndSuffixOfVertices(toSplits);
|
||||
prefixV = prefixAndSuffix.getFirst();
|
||||
suffixV = prefixAndSuffix.getSecond();
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple single-function interface to split and then update a graph
|
||||
*
|
||||
* @see #updateGraph(SeqVertex, SeqVertex) for a full description of top and bottom
|
||||
*
|
||||
* @param top the top vertex, may be null
|
||||
* @param bottom the bottom vertex, may be null
|
||||
* @param minCommonSequence the minimum prefix or suffix size necessary among the vertices to split up
|
||||
* before we'll go ahead and actually do the splitting. Allows one to determine
|
||||
* whether there's actually any useful splitting to do, as well as protect
|
||||
* yourself against spurious splitting of nodes based on trivial amounts of overall
|
||||
* @return true if some useful splitting was done, false otherwise
|
||||
*/
|
||||
public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom, final int minCommonSequence) {
|
||||
if ( prefixV.length() < minCommonSequence && suffixV.length() < minCommonSequence )
|
||||
return false;
|
||||
split();
|
||||
updateGraph(top, bottom);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually do the splitting up of the vertices
|
||||
*
|
||||
* Must be called before calling updateGraph
|
||||
*/
|
||||
public void split() {
|
||||
splitGraph = new SeqGraph();
|
||||
newMiddles = new LinkedList<SeqVertex>();
|
||||
edgesToRemove = new LinkedList<BaseEdge>();
|
||||
|
||||
splitGraph.addVertices(prefixV, suffixV);
|
||||
|
||||
for ( final SeqVertex mid : toSplits ) {
|
||||
final BaseEdge toMid = processEdgeToRemove(mid, outer.incomingEdgeOf(mid));
|
||||
final BaseEdge fromMid = processEdgeToRemove(mid, outer.outgoingEdgeOf(mid));
|
||||
|
||||
final SeqVertex remaining = mid.withoutPrefixAndSuffix(prefixV.getSequence(), suffixV.getSequence());
|
||||
if ( remaining != null ) {
|
||||
// there's some sequence prefix + seq + suffix, so add the node and make edges
|
||||
splitGraph.addVertex(remaining);
|
||||
newMiddles.add(remaining);
|
||||
// update edge from top -> middle to be top -> without suffix
|
||||
splitGraph.addEdge(prefixV, remaining, toMid);
|
||||
splitGraph.addEdge(remaining, suffixV, fromMid);
|
||||
} else {
|
||||
// prefix + suffix completely explain this node
|
||||
splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update graph outer, replacing the previous middle vertices that were split out with the new
|
||||
* graph structure of the split, linking this subgraph into the graph at top and bot (the
|
||||
* vertex connecting the middle nodes and the vertex outgoing of all middle node)
|
||||
*
|
||||
* @param top an optional top node that must have outgoing edges to all split vertices. If null, this subgraph
|
||||
* will be added without any incoming edges
|
||||
* @param bot an optional bottom node that must have incoming edges to all split vertices. If null, this subgraph
|
||||
* will be added without any outgoing edges to the rest of the graph
|
||||
*/
|
||||
public void updateGraph(final SeqVertex top, final SeqVertex bot) {
|
||||
if ( ! outer.vertexSet().containsAll(toSplits) ) throw new IllegalArgumentException("graph doesn't contain all of the original vertices to split");
|
||||
if ( top == null && bot == null ) throw new IllegalArgumentException("Cannot update graph without at least one top or bot vertex, but both were null");
|
||||
if ( top != null && ! outer.containsVertex(top) ) throw new IllegalArgumentException("top " + top + " not found in graph " + outer);
|
||||
if ( bot != null && ! outer.containsVertex(bot) ) throw new IllegalArgumentException("bot " + bot + " not found in graph " + outer);
|
||||
if ( splitGraph == null ) throw new IllegalStateException("Cannot call updateGraph until split() has been called");
|
||||
|
||||
outer.removeAllVertices(toSplits);
|
||||
outer.removeAllEdges(edgesToRemove);
|
||||
|
||||
outer.addVertices(newMiddles);
|
||||
|
||||
final boolean hasPrefixSuffixEdge = splitGraph.getEdge(prefixV, suffixV) != null;
|
||||
final boolean hasOnlyPrefixSuffixEdges = hasPrefixSuffixEdge && splitGraph.outDegreeOf(prefixV) == 1;
|
||||
final boolean needPrefixNode = ! prefixV.isEmpty() || (top == null && ! hasOnlyPrefixSuffixEdges);
|
||||
final boolean needSuffixNode = ! suffixV.isEmpty() || (bot == null && ! hasOnlyPrefixSuffixEdges);
|
||||
|
||||
// if prefix / suffix are needed, keep them
|
||||
final SeqVertex topForConnect = needPrefixNode ? prefixV : top;
|
||||
final SeqVertex botForConnect = needSuffixNode ? suffixV : bot;
|
||||
|
||||
if ( needPrefixNode ) {
|
||||
outer.addVertex(prefixV);
|
||||
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0));
|
||||
}
|
||||
|
||||
if ( needSuffixNode ) {
|
||||
outer.addVertex(suffixV);
|
||||
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0));
|
||||
}
|
||||
|
||||
if ( topForConnect != null ) {
|
||||
for ( final BaseEdge e : splitGraph.outgoingEdgesOf(prefixV) ) {
|
||||
final SeqVertex target = splitGraph.getEdgeTarget(e);
|
||||
|
||||
if ( target == suffixV ) { // going straight from prefix -> suffix
|
||||
if ( botForConnect != null )
|
||||
outer.addEdge(topForConnect, botForConnect, e);
|
||||
} else {
|
||||
outer.addEdge(topForConnect, target, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( botForConnect != null ) {
|
||||
for ( final BaseEdge e : splitGraph.incomingEdgesOf(suffixV) ) {
|
||||
outer.addEdge(splitGraph.getEdgeSource(e), botForConnect, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the longest suffix of bases shared among all provided vertices
|
||||
*
|
||||
* For example, if the vertices have sequences AC, CC, and ATC, this would return
|
||||
* a single C. However, for ACC and TCC this would return CC. And for AC and TG this
|
||||
* would return null;
|
||||
*
|
||||
* @param middleVertices a non-empty set of vertices
|
||||
* @return
|
||||
*/
|
||||
@Requires("!middleVertices.isEmpty()")
|
||||
protected static Pair<SeqVertex, SeqVertex> commonPrefixAndSuffixOfVertices(final Collection<SeqVertex> middleVertices) {
|
||||
final List<byte[]> kmers = new ArrayList<byte[]>(middleVertices.size());
|
||||
|
||||
int min = Integer.MAX_VALUE;
|
||||
for ( final SeqVertex v : middleVertices ) {
|
||||
kmers.add(v.getSequence());
|
||||
min = Math.min(min, v.getSequence().length);
|
||||
}
|
||||
|
||||
final int prefixLen = Utils.compPrefixLen(kmers, min);
|
||||
final int suffixLen = Utils.compSuffixLen(kmers, min - prefixLen);
|
||||
|
||||
final byte[] kmer = kmers.get(0);
|
||||
final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen);
|
||||
final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length);
|
||||
return new Pair<SeqVertex, SeqVertex>(new SeqVertex(prefix), new SeqVertex(suffix));
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that returns an edge that we should use for splitting
|
||||
*
|
||||
* If e is null, creates a new 0 multiplicity edge, set to ref is any edges to V are ref
|
||||
* If e is not null, returns a new copy of e, and schedules e for removal
|
||||
*
|
||||
* @param e a non-null edge
|
||||
* @return a non-null edge
|
||||
*/
|
||||
@Requires("v != null")
|
||||
@Ensures("result != null")
|
||||
private BaseEdge processEdgeToRemove(final SeqVertex v, final BaseEdge e) {
|
||||
if ( e == null ) {
|
||||
// there's no edge, so we return a newly allocated one and don't schedule e for removal
|
||||
// the weight must be 0 to preserve sum through the diamond
|
||||
return new BaseEdge(outer.isReferenceNode(v), 0);
|
||||
} else {
|
||||
// schedule edge for removal, and return a freshly allocated one for our graph to use
|
||||
edgesToRemove.add(e);
|
||||
return new BaseEdge(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Utility functions used in the graphs package
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/25/13
|
||||
* Time: 9:42 PM
|
||||
*/
|
||||
final class Utils {
|
||||
private Utils() {}
|
||||
|
||||
/**
|
||||
* Compute the maximum shared prefix length of list of bytes.
|
||||
*
|
||||
* @param listOfBytes a list of bytes with at least one element
|
||||
* @param minLength the min. length among all byte[] in listOfBytes
|
||||
* @return the number of shared bytes common at the start of all bytes
|
||||
*/
|
||||
@Requires({"listOfBytes.size() >= 1", "minLength >= 0"})
|
||||
@Ensures("result >= 0")
|
||||
protected static int compPrefixLen(final List<byte[]> listOfBytes, final int minLength) {
|
||||
for ( int i = 0; i < minLength; i++ ) {
|
||||
final byte b = listOfBytes.get(0)[i];
|
||||
for ( int j = 1; j < listOfBytes.size(); j++ ) {
|
||||
if ( b != listOfBytes.get(j)[i] )
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
return minLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the maximum shared suffix length of list of bytes.
|
||||
*
|
||||
* @param listOfBytes a list of bytes with at least one element
|
||||
* @param minLength the min. length among all byte[] in listOfBytes
|
||||
* @return the number of shared bytes common at the end of all bytes
|
||||
*/
|
||||
@Requires({"listOfBytes.size() >= 1", "minLength >= 0"})
|
||||
@Ensures("result >= 0")
|
||||
protected static int compSuffixLen(final List<byte[]> listOfBytes, final int minLength) {
|
||||
for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) {
|
||||
final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1];
|
||||
for ( int j = 1; j < listOfBytes.size(); j++ ) {
|
||||
if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] )
|
||||
return suffixLen;
|
||||
}
|
||||
}
|
||||
return minLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of kmers as byte[] from the vertices in the graph
|
||||
*
|
||||
* @param vertices a collection of vertices
|
||||
* @return a list of their kmers in order of the iterator on vertices
|
||||
*/
|
||||
protected static List<byte[]> getKmers(final Collection<SeqVertex> vertices) {
|
||||
final List<byte[]> kmers = new ArrayList<byte[]>(vertices.size());
|
||||
for ( final SeqVertex v : vertices ) {
|
||||
kmers.add(v.getSequence());
|
||||
}
|
||||
return kmers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the minimum length of a collection of byte[]
|
||||
*
|
||||
* @param kmers a list of kmers whose .length min we want
|
||||
* @return the min of the kmers, if kmers is empty the result is 0
|
||||
*/
|
||||
protected static int minKmerLength(final Collection<byte[]> kmers) {
|
||||
if ( kmers == null ) throw new IllegalArgumentException("kmers cannot be null");
|
||||
|
||||
if ( kmers.isEmpty() ) return 0;
|
||||
int min = Integer.MAX_VALUE;
|
||||
for ( final byte[] kmer : kmers ) {
|
||||
min = Math.min(min, kmer.length);
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -56,6 +56,7 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
|
|
@ -72,8 +73,8 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
|
|||
public void testReferenceCycleGraph() {
|
||||
String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
|
||||
String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
|
||||
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), false);
|
||||
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), false);
|
||||
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true));
|
||||
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true));
|
||||
|
||||
Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
|
||||
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6dd29d6fec056419ab0fa03a7d43d85e");
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9fa4d3c88fd9c0f23c7a3ddd3d24a8c");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
@ -75,7 +75,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
// TODO -- need a better symbolic allele test
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleSymbolic() {
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8225fb59b9fcbe767a473c9eb8b21537");
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554");
|
||||
}
|
||||
|
||||
private void HCTestComplexGGA(String bam, String args, String md5) {
|
||||
|
|
@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAComplex() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
|
||||
"84616464aed68f4d9bc9e08472eff9c0");
|
||||
"e8ffbfae3c1af5be02631a31f386a431");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"e2d1023b846bfac31b4f7a3a4b90d931");
|
||||
"c3a98b19efa7cb36fe5f5f2ab893ef56");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "9859b136d05085b5ec0833035289106a");
|
||||
HCTest(CEUTRIO_BAM, "", "45856ad67bfe8d8bea45808d8258bcf1");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "27f660bf1c9a6ed7167d77022d401b73");
|
||||
HCTest(NA12878_BAM, "", "b6c93325f851ac358ea49260fb11b75c");
|
||||
}
|
||||
|
||||
@Test(enabled = false) // can't annotate the rsID's yet
|
||||
|
|
@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"e25fc2196401a16347e0c730dbcbe828");
|
||||
"4ca6b560d0569cdca400d3e50915e211");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
|
|
@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "325d7d73e0bd86b6cb146b249eda959a");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5d06ec5502d3f157964bd7b275d6a0cb");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("53a50dae68f0175ca3088dea1d3bb881"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec97a0a65890169358842e765ff8dd15"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("d3bc6adde8cd9514ae5c49cd366d5de4"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
|
|
@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("5280f1a50ca27d8e435da0bd5b26ae93"));
|
||||
Arrays.asList("4adb833ed8af20224b76bba61e2b0d93"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9"));
|
||||
Arrays.asList("1704b0901c86f8f597d931222d5c8dd8"));
|
||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -102,4 +102,20 @@ public class BaseEdgeUnitTest extends BaseTest {
|
|||
Assert.assertEquals(edges.get(2), e2);
|
||||
Assert.assertEquals(edges.get(3), e1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMax() {
|
||||
for ( final boolean firstIsRef : Arrays.asList(true, false) ) {
|
||||
for ( final boolean secondIsRef : Arrays.asList(true, false) ) {
|
||||
for ( final int firstMulti : Arrays.asList(1, 4) ) {
|
||||
for ( final int secondMulti : Arrays.asList(2, 3) ) {
|
||||
final BaseEdge expected = new BaseEdge(firstIsRef || secondIsRef, Math.max(firstMulti, secondMulti));
|
||||
final BaseEdge actual = new BaseEdge(firstIsRef, firstMulti).max(new BaseEdge(secondIsRef, secondMulti));
|
||||
Assert.assertEquals(actual.getMultiplicity(), expected.getMultiplicity());
|
||||
Assert.assertEquals(actual.isRef(), expected.isRef());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -44,27 +44,17 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.Test;
|
||||
import scala.actors.threadpool.Arrays;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 3/15/13
|
||||
* Time: 3:36 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class BaseGraphUnitTest extends BaseTest {
|
||||
SeqGraph graph;
|
||||
SeqVertex v1, v2, v3, v4, v5;
|
||||
|
|
@ -105,6 +95,137 @@ public class BaseGraphUnitTest extends BaseTest {
|
|||
assertVertexSetEquals(graph.incomingVerticesOf(v5), v4);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveSingletonOrphanVertices() throws Exception {
|
||||
// all vertices in graph are connected
|
||||
final List<SeqVertex> kept = new LinkedList<SeqVertex>(graph.vertexSet());
|
||||
final SeqVertex rm1 = new SeqVertex("CAGT");
|
||||
final SeqVertex rm2 = new SeqVertex("AGTC");
|
||||
graph.addVertices(rm1, rm2);
|
||||
Assert.assertEquals(graph.vertexSet().size(), kept.size() + 2);
|
||||
final BaseEdge rm12e = new BaseEdge(false, 1);
|
||||
graph.addEdge(rm1, rm2, rm12e);
|
||||
|
||||
final SeqGraph original = (SeqGraph)graph.clone();
|
||||
graph.removeSingletonOrphanVertices();
|
||||
Assert.assertTrue(BaseGraph.graphEquals(original, graph), "Graph with disconnected component but edges between components shouldn't be modified");
|
||||
|
||||
graph.removeEdge(rm12e); // now we should be able to remove rm1 and rm2
|
||||
graph.removeSingletonOrphanVertices();
|
||||
Assert.assertTrue(graph.vertexSet().containsAll(kept));
|
||||
Assert.assertFalse(graph.containsVertex(rm1));
|
||||
Assert.assertFalse(graph.containsVertex(rm2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemovePathsNotConnectedToRef() throws Exception {
|
||||
final SeqGraph graph = new SeqGraph();
|
||||
|
||||
SeqVertex src = new SeqVertex("A");
|
||||
SeqVertex end = new SeqVertex("A");
|
||||
SeqVertex g1 = new SeqVertex("C");
|
||||
SeqVertex g2 = new SeqVertex("G");
|
||||
SeqVertex g3 = new SeqVertex("T");
|
||||
SeqVertex g4 = new SeqVertex("AA");
|
||||
SeqVertex g5 = new SeqVertex("AA");
|
||||
SeqVertex g6 = new SeqVertex("AA");
|
||||
SeqVertex g8 = new SeqVertex("AA");
|
||||
SeqVertex g7 = new SeqVertex("AA");
|
||||
SeqVertex b1 = new SeqVertex("CC");
|
||||
SeqVertex b2 = new SeqVertex("GG");
|
||||
SeqVertex b3 = new SeqVertex("TT");
|
||||
SeqVertex b4 = new SeqVertex("AAA");
|
||||
SeqVertex b5 = new SeqVertex("CCC");
|
||||
SeqVertex b6 = new SeqVertex("GGG");
|
||||
SeqVertex b7 = new SeqVertex("AAAA");
|
||||
SeqVertex b8 = new SeqVertex("GGGG");
|
||||
SeqVertex b9 = new SeqVertex("CCCC");
|
||||
|
||||
graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8);
|
||||
graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end);
|
||||
graph.addEdges(src, g1, g5, g6, g7, end);
|
||||
graph.addEdges(src, g1, g5, g8, g7, end);
|
||||
graph.addEdges(src, g1, g3, end);
|
||||
|
||||
// the current state of the graph is the good one
|
||||
final SeqGraph good = (SeqGraph)graph.clone();
|
||||
|
||||
// now add the bads to the graph
|
||||
graph.addVertices(b1, b2, b3, b4, b5, b6, b7, b8, b9);
|
||||
graph.addEdges(src, b1); // source -> b1 is dead
|
||||
graph.addEdges(b6, src); // x -> source is bad
|
||||
graph.addEdges(g4, b2); // off random vertex is bad
|
||||
graph.addEdges(g3, b3, b4); // two vertices that don't connect to end are bad
|
||||
graph.addEdges(end, b5); // vertex off end is bad
|
||||
graph.addEdges(g3, b7, b8, b7); // cycle is bad
|
||||
graph.addEdges(g3, b9, b9); // self-cycle is bad
|
||||
|
||||
final boolean debug = false;
|
||||
if ( debug ) good.printGraph(new File("expected.dot"), 0);
|
||||
if ( debug ) graph.printGraph(new File("bad.dot"), 0);
|
||||
graph.removePathsNotConnectedToRef();
|
||||
if ( debug ) graph.printGraph(new File("actual.dot"), 0);
|
||||
|
||||
Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception {
|
||||
final SeqGraph graph = new SeqGraph();
|
||||
|
||||
SeqVertex src = new SeqVertex("A");
|
||||
SeqVertex end = new SeqVertex("A");
|
||||
SeqVertex g1 = new SeqVertex("C");
|
||||
SeqVertex g2 = new SeqVertex("G");
|
||||
SeqVertex g3 = new SeqVertex("T");
|
||||
SeqVertex g4 = new SeqVertex("AA");
|
||||
SeqVertex g5 = new SeqVertex("AA");
|
||||
SeqVertex g6 = new SeqVertex("AA");
|
||||
SeqVertex g8 = new SeqVertex("AA");
|
||||
SeqVertex g7 = new SeqVertex("AA");
|
||||
SeqVertex gPrev = new SeqVertex("AA");
|
||||
SeqVertex gPrev1 = new SeqVertex("AA");
|
||||
SeqVertex gPrev2 = new SeqVertex("AA");
|
||||
SeqVertex gAfter = new SeqVertex("AA");
|
||||
SeqVertex gAfter1 = new SeqVertex("AA");
|
||||
SeqVertex gAfter2 = new SeqVertex("AA");
|
||||
SeqVertex b1 = new SeqVertex("CC");
|
||||
SeqVertex b2 = new SeqVertex("GG");
|
||||
SeqVertex b3 = new SeqVertex("TT");
|
||||
SeqVertex b4 = new SeqVertex("AAA");
|
||||
SeqVertex b5 = new SeqVertex("CCC");
|
||||
SeqVertex b6 = new SeqVertex("GGG");
|
||||
|
||||
graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8, gPrev, gPrev1, gPrev2, gAfter, gAfter1, gAfter2);
|
||||
graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end);
|
||||
graph.addEdges(src, g1, g5, g6, g7, end);
|
||||
graph.addEdges(src, g1, g5, g8, g7, end);
|
||||
graph.addEdges(src, g1, g3, end);
|
||||
|
||||
// these should be kept, but are in the wrong direction
|
||||
graph.addEdges(gPrev, src);
|
||||
graph.addEdges(gPrev1, gPrev2, src);
|
||||
graph.addEdges(end, gAfter);
|
||||
graph.addEdges(end, gAfter1, gAfter2);
|
||||
|
||||
// the current state of the graph is the good one
|
||||
final SeqGraph good = (SeqGraph)graph.clone();
|
||||
|
||||
// now add the bads to the graph
|
||||
graph.addVertices(b1, b2, b3, b4, b5, b6);
|
||||
graph.addEdges(b2, b3); // b2 -> b3
|
||||
graph.addEdges(b4, b5, b4); // cycle
|
||||
graph.addEdges(b6, b6); // isolated self cycle
|
||||
|
||||
final boolean debug = false;
|
||||
if ( debug ) good.printGraph(new File("expected.dot"), 0);
|
||||
if ( debug ) graph.printGraph(new File("bad.dot"), 0);
|
||||
graph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
|
||||
if ( debug ) graph.printGraph(new File("actual.dot"), 0);
|
||||
|
||||
Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPrintEmptyGraph() throws Exception {
|
||||
final File tmp = File.createTempFile("tmp", "dot");
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -68,9 +68,10 @@ public class BaseVertexUnitTest extends BaseTest {
|
|||
new BaseVertex((byte[])null);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
@Test()
|
||||
public void testCreationEmptySeq() {
|
||||
new BaseVertex(new byte[0]);
|
||||
final BaseVertex v = new BaseVertex(new byte[0]);
|
||||
Assert.assertTrue(v.isEmpty(), "Version with length == 0 should be empty");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
public class CommonSuffixMergerUnitTest extends BaseTest {
|
||||
private final static boolean PRINT_GRAPHS = true;
|
||||
|
||||
@DataProvider(name = "CompleteCycleData")
|
||||
public Object[][] makeCompleteCycleData() {
|
||||
return makeSplitMergeData(-1);
|
||||
}
|
||||
|
||||
public static class SplitMergeData {
|
||||
final SeqGraph graph;
|
||||
final SeqVertex v;
|
||||
final String commonSuffix;
|
||||
|
||||
public SplitMergeData(SeqGraph graph, SeqVertex v, String commonSuffix) {
|
||||
this.graph = graph;
|
||||
this.v = v;
|
||||
this.commonSuffix = commonSuffix;
|
||||
}
|
||||
}
|
||||
|
||||
public static Object[][] makeSplitMergeData(final int maxTests) {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<String> bases = Arrays.asList("A", "C", "G", "T");
|
||||
for ( final String commonSuffix : Arrays.asList("", "A", "AT") ) {
|
||||
for ( final int nBots : Arrays.asList(0, 1, 2) ) {
|
||||
for ( final int nMids : Arrays.asList(1, 2, 3) ) {
|
||||
for ( int nTops = 0; nTops < nMids; nTops++ ) {
|
||||
for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) {
|
||||
int multi = 1;
|
||||
final SeqGraph graph = new SeqGraph();
|
||||
final SeqVertex v = new SeqVertex("GGGG");
|
||||
graph.addVertex(v);
|
||||
|
||||
final LinkedList<SeqVertex> tops = new LinkedList<SeqVertex>();
|
||||
final LinkedList<SeqVertex> mids = new LinkedList<SeqVertex>();
|
||||
|
||||
for ( int i = 0; i < nMids; i++) {
|
||||
final SeqVertex mid = new SeqVertex(bases.get(i) + commonSuffix);
|
||||
graph.addVertex(mid);
|
||||
graph.addEdge(mid, v, new BaseEdge(i == 0, multi++));
|
||||
mids.add(mid);
|
||||
|
||||
tops.add(new SeqVertex(bases.get(i)));
|
||||
}
|
||||
|
||||
graph.addVertices(tops);
|
||||
for ( final SeqVertex t : tops ) {
|
||||
for ( int i = 0; i < nTopConnections; i++ ) {
|
||||
graph.addEdge(t, mids.get(i), new BaseEdge(i == 0, multi++));
|
||||
}
|
||||
}
|
||||
|
||||
for ( int i = 0; i < nBots; i++ ) {
|
||||
final SeqVertex bot = new SeqVertex(bases.get(i));
|
||||
graph.addVertex(bot);
|
||||
graph.addEdge(v, bot, new BaseEdge(i == 0, multi++));
|
||||
|
||||
}
|
||||
|
||||
tests.add(new Object[]{new SplitMergeData(graph, v, commonSuffix)});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final List<Object[]> toUse = maxTests == -1 ? tests : tests.subList(0, Math.min(tests.size(), maxTests));
|
||||
return toUse.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) {
|
||||
try {
|
||||
final Set<String> haplotypes = new HashSet<String>();
|
||||
final List<Path<SeqVertex>> originalPaths = new KBestPaths<SeqVertex>().getKBestPaths(original);
|
||||
for ( final Path<SeqVertex> path : originalPaths )
|
||||
haplotypes.add(new String(path.getBases()));
|
||||
|
||||
final List<Path<SeqVertex>> splitPaths = new KBestPaths<SeqVertex>().getKBestPaths(actual);
|
||||
for ( final Path<SeqVertex> path : splitPaths ) {
|
||||
final String h = new String(path.getBases());
|
||||
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
|
||||
}
|
||||
|
||||
if ( splitPaths.size() == originalPaths.size() ) {
|
||||
for ( int i = 0; i < originalPaths.size(); i++ ) {
|
||||
Assert.assertTrue(splitPaths.get(i).equalSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i));
|
||||
}
|
||||
}
|
||||
} catch ( AssertionError e ) {
|
||||
if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0);
|
||||
if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CompleteCycleData")
|
||||
public void testMerging(final SplitMergeData data) {
|
||||
final SeqGraph original = (SeqGraph)data.graph.clone();
|
||||
final SharedSequenceMerger splitter = new SharedSequenceMerger();
|
||||
splitter.merge(data.graph, data.v);
|
||||
assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
public class CommonSuffixSplitterUnitTest extends BaseTest {
|
||||
@DataProvider(name = "SplitData")
|
||||
public Object[][] makeSplitData() {
|
||||
return CommonSuffixMergerUnitTest.makeSplitMergeData(-1);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SplitData")
|
||||
public void testSplit(final CommonSuffixMergerUnitTest.SplitMergeData data) {
|
||||
final boolean expectedMerge = ! data.commonSuffix.isEmpty() && data.graph.inDegreeOf(data.v) > 1;
|
||||
|
||||
final SeqGraph original = (SeqGraph)data.graph.clone();
|
||||
// original.printGraph(new File("original.dot"), 0);
|
||||
final CommonSuffixSplitter splitter = new CommonSuffixSplitter();
|
||||
final boolean succeed = splitter.split(data.graph, data.v);
|
||||
// data.graph.printGraph(new File("actual.dot"), 0);
|
||||
Assert.assertEquals(succeed, expectedMerge, "Not excepted merge success/fail result");
|
||||
if ( succeed ) {
|
||||
Assert.assertEquals(data.graph.incomingVerticesOf(data.v).iterator().next().getSequenceString(), data.commonSuffix, "Common suffix not computed correctly");
|
||||
}
|
||||
|
||||
CommonSuffixMergerUnitTest.assertSameHaplotypes(String.format("suffixSplit.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSplitPrevHaveMultipleEdges() {
|
||||
final SeqGraph original = new SeqGraph();
|
||||
final SeqVertex v1 = new SeqVertex("A");
|
||||
final SeqVertex v2 = new SeqVertex("A");
|
||||
final SeqVertex v3 = new SeqVertex("A");
|
||||
final SeqVertex v4 = new SeqVertex("A");
|
||||
|
||||
original.addVertices(v1, v2, v3, v4);
|
||||
original.addEdges(v1, v3);
|
||||
|
||||
Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with only one vertex");
|
||||
|
||||
original.addEdges(v2, v3);
|
||||
original.addEdges(v2, v4);
|
||||
|
||||
Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with multiple outgoing edges from middle nodes");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSplitNoCycles() {
|
||||
final SeqGraph original = new SeqGraph();
|
||||
final SeqVertex v1 = new SeqVertex("A");
|
||||
final SeqVertex v2 = new SeqVertex("C");
|
||||
final SeqVertex v3 = new SeqVertex("C");
|
||||
final SeqVertex v4 = new SeqVertex("G");
|
||||
|
||||
original.addVertices(v1, v2, v3, v4);
|
||||
original.addEdges(v1, v3, v4);
|
||||
original.addEdges(v1, v2, v4);
|
||||
|
||||
Assert.assertTrue(new CommonSuffixSplitter().split((SeqGraph)original.clone(), v4), "Should be able to split pre-cycle graph");
|
||||
|
||||
original.addEdges(v4, v4);
|
||||
Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list");
|
||||
}
|
||||
}
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
|
|
@ -1,57 +1,56 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
|
@ -181,7 +180,9 @@ public class SeqGraphUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "IsDiamondData", enabled = true)
|
||||
public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) {
|
||||
Assert.assertEquals(graph.isRootOfDiamond(v), isRootOfDiamond);
|
||||
final SeqGraph.MergeDiamonds merger = graph.new MergeDiamonds();
|
||||
merger.setDontModifyGraphEvenIfPossible();
|
||||
Assert.assertEquals(merger.tryToTransform(v), isRootOfDiamond);
|
||||
}
|
||||
|
||||
@DataProvider(name = "MergingData")
|
||||
|
|
@ -267,7 +268,7 @@ public class SeqGraphUnitTest extends BaseTest {
|
|||
tests.add(new Object[]{graph.clone(), expected.clone()});
|
||||
}
|
||||
|
||||
{
|
||||
{ // all the nodes -> lots of merging and motion of nodes
|
||||
final SeqGraph all = new SeqGraph();
|
||||
all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2);
|
||||
all.addEdges(pre1, top, middle1, bottom, tail1);
|
||||
|
|
@ -277,9 +278,13 @@ public class SeqGraphUnitTest extends BaseTest {
|
|||
final SeqVertex newMiddle1 = new SeqVertex("G");
|
||||
final SeqVertex newMiddle2 = new SeqVertex("T");
|
||||
final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString());
|
||||
expected.addVertices(pre1, pre2, top, newMiddle1, newMiddle2, newBottom, tail1, tail2);
|
||||
expected.addEdges(pre1, top, newMiddle1, newBottom, tail1);
|
||||
expected.addEdges(pre2, top, newMiddle2, newBottom, tail2);
|
||||
final SeqVertex newTop = new SeqVertex("A");
|
||||
final SeqVertex newTopDown1 = new SeqVertex("G");
|
||||
final SeqVertex newTopDown2 = new SeqVertex("C");
|
||||
final SeqVertex newTopBottomMerged = new SeqVertex("TA");
|
||||
expected.addVertices(newTop, newTopDown1, newTopDown2, newTopBottomMerged, newMiddle1, newMiddle2, newBottom, tail1, tail2);
|
||||
expected.addEdges(newTop, newTopDown1, newTopBottomMerged, newMiddle1, newBottom, tail1);
|
||||
expected.addEdges(newTop, newTopDown2, newTopBottomMerged, newMiddle2, newBottom, tail2);
|
||||
tests.add(new Object[]{all.clone(), expected.clone()});
|
||||
}
|
||||
|
||||
|
|
@ -309,8 +314,17 @@ public class SeqGraphUnitTest extends BaseTest {
|
|||
@Test(dataProvider = "MergingData", enabled = true)
|
||||
public void testMerging(final SeqGraph graph, final SeqGraph expected) {
|
||||
final SeqGraph merged = (SeqGraph)graph.clone();
|
||||
merged.simplifyGraph();
|
||||
Assert.assertTrue(SeqGraph.graphEquals(merged, expected));
|
||||
merged.simplifyGraph(1);
|
||||
try {
|
||||
Assert.assertTrue(SeqGraph.graphEquals(merged, expected));
|
||||
} catch (AssertionError e) {
|
||||
// if ( ! SeqGraph.graphEquals(merged, expected) ) {
|
||||
// graph.printGraph(new File("graph.dot"), 0);
|
||||
// merged.printGraph(new File("merged.dot"), 0);
|
||||
// expected.printGraph(new File("expected.dot"), 0);
|
||||
// }
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
// A -> ACT -> C [non-ref]
|
||||
|
|
@ -332,13 +346,9 @@ public class SeqGraphUnitTest extends BaseTest {
|
|||
graph.addEdge(mid1, bot, new BaseEdge(true, 1));
|
||||
|
||||
final SeqGraph expected = new SeqGraph();
|
||||
expected.addVertices(top, mid1, bot);
|
||||
expected.addEdge(top, mid1, new BaseEdge(true, 2));
|
||||
expected.addEdge(mid1, bot, new BaseEdge(true, 2));
|
||||
|
||||
expected.addVertex(new SeqVertex("AACTC"));
|
||||
final SeqGraph actual = ((SeqGraph)graph.clone());
|
||||
actual.mergeBranchingNodes();
|
||||
|
||||
Assert.assertTrue(BaseGraph.graphEquals(actual, expected));
|
||||
actual.simplifyGraph();
|
||||
Assert.assertTrue(BaseGraph.graphEquals(actual, expected), "Wrong merging result after complete merging");
|
||||
}
|
||||
}
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -52,7 +52,6 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class SeqVertexUnitTest extends BaseTest {
|
||||
|
|
@ -0,0 +1,253 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
|
||||
private final static boolean PRINT_GRAPHS = false;
|
||||
|
||||
@DataProvider(name = "PrefixSuffixData")
|
||||
public Object[][] makePrefixSuffixData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0});
|
||||
tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0});
|
||||
tests.add(new Object[]{Arrays.asList("ACT", "AGT"), 1, 1});
|
||||
tests.add(new Object[]{Arrays.asList("ACCT", "AGT"), 1, 1});
|
||||
tests.add(new Object[]{Arrays.asList("ACT", "ACT"), 3, 0});
|
||||
tests.add(new Object[]{Arrays.asList("ACTA", "ACT"), 3, 0});
|
||||
tests.add(new Object[]{Arrays.asList("ACTA", "ACTG"), 3, 0});
|
||||
tests.add(new Object[]{Arrays.asList("ACTA", "ACTGA"), 3, 1});
|
||||
tests.add(new Object[]{Arrays.asList("GCTGA", "ACTGA"), 0, 4});
|
||||
|
||||
tests.add(new Object[]{Arrays.asList("A", "C", "A"), 0, 0});
|
||||
tests.add(new Object[]{Arrays.asList("A", "A", "A"), 1, 0});
|
||||
tests.add(new Object[]{Arrays.asList("A", "AA", "A"), 1, 0});
|
||||
tests.add(new Object[]{Arrays.asList("A", "ACA", "A"), 1, 0});
|
||||
tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACT"), 2, 1});
|
||||
tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACGT"), 2, 1});
|
||||
tests.add(new Object[]{Arrays.asList("AAAT", "AAA", "CAAA"), 0, 0});
|
||||
tests.add(new Object[]{Arrays.asList("AACTTT", "AAGTTT", "AAGCTTT"), 2, 3});
|
||||
tests.add(new Object[]{Arrays.asList("AAA", "AAA", "CAAA"), 0, 3});
|
||||
tests.add(new Object[]{Arrays.asList("AAA", "AAA", "AAA"), 3, 0});
|
||||
|
||||
tests.add(new Object[]{Arrays.asList("AC", "ACA", "AC"), 2, 0});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PrefixSuffixData")
|
||||
public void testPrefixSuffix(final List<String> strings, int expectedPrefixLen, int expectedSuffixLen) {
|
||||
final List<byte[]> bytes = new ArrayList<byte[]>();
|
||||
int min = Integer.MAX_VALUE;
|
||||
for ( final String s : strings ) {
|
||||
bytes.add(s.getBytes());
|
||||
min = Math.min(min, s.length());
|
||||
}
|
||||
|
||||
final int actualPrefixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compPrefixLen(bytes, min);
|
||||
Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test");
|
||||
|
||||
final int actualSuffixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compSuffixLen(bytes, min - actualPrefixLen);
|
||||
Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test");
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PrefixSuffixData")
|
||||
public void testPrefixSuffixVertices(final List<String> strings, int expectedPrefixLen, int expectedSuffixLen) {
|
||||
final List<SeqVertex> v = new ArrayList<SeqVertex>();
|
||||
for ( final String s : strings ) {
|
||||
v.add(new SeqVertex(s));
|
||||
}
|
||||
|
||||
final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen);
|
||||
final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen);
|
||||
|
||||
final Pair<SeqVertex, SeqVertex> result = SharedVertexSequenceSplitter.commonPrefixAndSuffixOfVertices(v);
|
||||
Assert.assertEquals(result.getFirst().getSequenceString(), expectedPrefix, "Failed suffix test");
|
||||
Assert.assertEquals(result.getSecond().getSequenceString(), expectedSuffix, "Failed suffix test");
|
||||
|
||||
Assert.assertEquals(result.getFirst().isEmpty(), expectedPrefix.isEmpty());
|
||||
Assert.assertEquals(result.getSecond().isEmpty(), expectedSuffix.isEmpty());
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PrefixSuffixData")
|
||||
public void testSplitter(final List<String> strings, int expectedPrefixLen, int expectedSuffixLen) {
|
||||
final SeqGraph graph = new SeqGraph();
|
||||
|
||||
final List<SeqVertex> v = new ArrayList<SeqVertex>();
|
||||
for ( final String s : strings ) {
|
||||
v.add(new SeqVertex(s));
|
||||
}
|
||||
|
||||
graph.addVertices(v.toArray(new SeqVertex[]{}));
|
||||
|
||||
final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen);
|
||||
final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen);
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
|
||||
splitter.split();
|
||||
// splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".dot"), 0);
|
||||
|
||||
Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix);
|
||||
Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix);
|
||||
|
||||
Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= strings.size());
|
||||
Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0);
|
||||
|
||||
Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= strings.size());
|
||||
Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0);
|
||||
|
||||
for ( final SeqVertex mid : splitter.newMiddles ) {
|
||||
Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, mid));
|
||||
Assert.assertNotNull(splitter.splitGraph.getEdge(mid, splitter.suffixV));
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "CompleteCycleData")
|
||||
public Object[][] makeCompleteCycleData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final boolean hasTop : Arrays.asList(true, false) ) {
|
||||
for ( final boolean hasBot : Arrays.asList(true, false) ) {
|
||||
if ( ! hasTop && ! hasBot ) continue;
|
||||
tests.add(new Object[]{Arrays.asList("A", "A"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("A", "C"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("A", "AC"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("A", "CA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("A", "ACA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("AC", "ACA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("AT", "ACA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("ATA", "ACA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("ATAA", "ACA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("ATAACA", "ACA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("CCCAAA", "AAA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "AAA"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "CCCAAA"), hasTop, hasBot});
|
||||
|
||||
tests.add(new Object[]{Arrays.asList("A", "A", "A"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("A", "A", "C"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("A", "C", "C"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("AC", "C", "C"), hasTop, hasBot});
|
||||
tests.add(new Object[]{Arrays.asList("CA", "C", "C"), hasTop, hasBot});
|
||||
// all merged
|
||||
tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGA"), hasTop, hasBot});
|
||||
// prefix and suffix
|
||||
tests.add(new Object[]{Arrays.asList("AGA", "AGA", "ACA"), hasTop, hasBot});
|
||||
// 2 -> prefix, leave C
|
||||
tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGAC"), hasTop, hasBot});
|
||||
// 2 -> prefix, leave CCC
|
||||
tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGACCC"), hasTop, hasBot});
|
||||
// 2 -> suffix, leave A/T
|
||||
tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AAGA"), hasTop, hasBot});
|
||||
// 2 -> suffix, leave T, delete 1
|
||||
tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AGA"), hasTop, hasBot});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CompleteCycleData")
|
||||
public void testSplitterCompleteCycle(final List<String> strings, final boolean hasTop, final boolean hasBot) {
|
||||
final SeqGraph graph = new SeqGraph();
|
||||
|
||||
int edgeWeight = 1;
|
||||
final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null;
|
||||
final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null;
|
||||
final List<SeqVertex> v = new ArrayList<SeqVertex>();
|
||||
for ( final String s : strings ) {
|
||||
v.add(new SeqVertex(s));
|
||||
}
|
||||
graph.addVertices(v.toArray(new SeqVertex[]{}));
|
||||
final SeqVertex first = v.get(0);
|
||||
|
||||
if ( hasTop ) {
|
||||
graph.addVertex(top);
|
||||
for ( final SeqVertex vi : v )
|
||||
graph.addEdge(top, vi, new BaseEdge(vi == first, edgeWeight++));
|
||||
}
|
||||
|
||||
if ( hasBot ) {
|
||||
graph.addVertex(bot);
|
||||
for ( final SeqVertex vi : v )
|
||||
graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++));
|
||||
}
|
||||
|
||||
final Set<String> haplotypes = new HashSet<String>();
|
||||
final List<Path<SeqVertex>> originalPaths = new KBestPaths<SeqVertex>().getKBestPaths((SeqGraph)graph.clone());
|
||||
for ( final Path<SeqVertex> path : originalPaths )
|
||||
haplotypes.add(new String(path.getBases()));
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
|
||||
splitter.split();
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0);
|
||||
splitter.updateGraph(top, bot);
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0);
|
||||
|
||||
final List<Path<SeqVertex>> splitPaths = new KBestPaths<SeqVertex>().getKBestPaths(graph);
|
||||
for ( final Path<SeqVertex> path : splitPaths ) {
|
||||
final String h = new String(path.getBases());
|
||||
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
|
||||
}
|
||||
|
||||
if ( splitPaths.size() == originalPaths.size() ) {
|
||||
for ( int i = 0; i < originalPaths.size(); i++ ) {
|
||||
Assert.assertTrue(splitPaths.get(i).equalScoreAndSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
public class MostLikelyAlleleUnitTest extends BaseTest {
|
||||
final Allele a = Allele.create("A");
|
||||
|
||||
@Test
|
||||
public void testBasicCreation() {
|
||||
final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1;
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
|
||||
Assert.assertEquals(mla.getMostLikelyAllele(), a);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second);
|
||||
|
||||
Assert.assertEquals(mla.isInformative(), true);
|
||||
Assert.assertEquals(mla.isInformative(10), false);
|
||||
Assert.assertEquals(mla.isInformative(0), true);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(), a);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(0), a);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNotDefaultInformative() {
|
||||
final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2);
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
|
||||
Assert.assertEquals(mla.isInformative(), false);
|
||||
Assert.assertEquals(mla.isInformative(10), false);
|
||||
Assert.assertEquals(mla.isInformative(0), true);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(0), a);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreationNoGoodSecond() {
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(mla.getMostLikelyAllele(), a);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
|
||||
|
||||
Assert.assertEquals(mla.isInformative(), true);
|
||||
Assert.assertEquals(mla.isInformative(10), true);
|
||||
Assert.assertEquals(mla.isInformative(0), true);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(), a);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(10), a);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(0), a);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreationNoAllele() {
|
||||
MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
|
||||
|
||||
Assert.assertEquals(mla.isInformative(), false);
|
||||
Assert.assertEquals(mla.isInformative(10), false);
|
||||
Assert.assertEquals(mla.isInformative(0), false);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL);
|
||||
Assert.assertEquals(mla.getAlleleIfInformative(0), Allele.NO_CALL);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,32 +1,34 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
|
||||
/**
|
||||
|
|
@ -35,9 +37,17 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
|||
* @author mdepristo
|
||||
*/
|
||||
public class HCMappingQualityFilter extends ReadFilter {
|
||||
private final static Logger logger = Logger.getLogger(HCMappingQualityFilter.class);
|
||||
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for analysis with the HaplotypeCaller", required = false)
|
||||
public int MIN_MAPPING_QUALTY_SCORE = 20;
|
||||
|
||||
@Override
|
||||
public void initialize(GenomeAnalysisEngine engine) {
|
||||
if ( MIN_MAPPING_QUALTY_SCORE > 0 )
|
||||
logger.info("Filtering out reads with MAPQ < " + MIN_MAPPING_QUALTY_SCORE);
|
||||
}
|
||||
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,127 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
/**
|
||||
* Stores the most likely and second most likely alleles, along with a threshold
|
||||
* for assuming computing that a read is informative.
|
||||
*
|
||||
* If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD
|
||||
* then the most likely allele is set to "no call", and isInformative will return false. This constant can be
|
||||
* overridden simply by using one of the version of these calls that accepts informative threshold as an argument.
|
||||
*
|
||||
* For convenience, there are functions called getAlleleIfInformative that return either the most likely allele, or
|
||||
* NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD of one another.
|
||||
*
|
||||
* By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the
|
||||
* corresponding key
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/24/13
|
||||
* Time: 1:39 PM
|
||||
*/
|
||||
public final class MostLikelyAllele {
|
||||
public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2;
|
||||
|
||||
final Allele mostLikely;
|
||||
final double log10LikelihoodOfMostLikely;
|
||||
final double log10LikelihoodOfSecondBest;
|
||||
|
||||
/**
|
||||
* Create a new MostLikelyAllele
|
||||
*
|
||||
* If there's a meaningful most likely allele, allele should be a real allele. If none can be determined,
|
||||
* mostLikely should be a NO_CALL allele.
|
||||
*
|
||||
* @param mostLikely the most likely allele
|
||||
* @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele
|
||||
* @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available)
|
||||
*/
|
||||
public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
|
||||
if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null");
|
||||
if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) )
|
||||
throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely);
|
||||
if ( log10LikelihoodOfSecondBest != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfSecondBest) )
|
||||
throw new IllegalArgumentException("log10LikelihoodOfSecondBest must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfSecondBest);
|
||||
if ( log10LikelihoodOfMostLikely < log10LikelihoodOfSecondBest )
|
||||
throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest);
|
||||
|
||||
this.mostLikely = mostLikely;
|
||||
this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely;
|
||||
this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest;
|
||||
}
|
||||
|
||||
public Allele getMostLikelyAllele() {
|
||||
return mostLikely;
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfMostLikely() {
|
||||
return log10LikelihoodOfMostLikely;
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfSecondBest() {
|
||||
return log10LikelihoodOfSecondBest;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #isInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD
|
||||
*/
|
||||
public boolean isInformative() {
|
||||
return isInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Was this allele selected from an object that was specifically informative about the allele?
|
||||
*
|
||||
* The calculation that implements this is whether the likelihood of the most likely allele is larger
|
||||
* than the second most likely by at least the log10ThresholdForInformative
|
||||
*
|
||||
* @return true if so, false if not
|
||||
*/
|
||||
public boolean isInformative(final double log10ThresholdForInformative) {
|
||||
return getLog10LikelihoodOfMostLikely() - getLog10LikelihoodOfSecondBest() > log10ThresholdForInformative;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getAlleleIfInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD
|
||||
*/
|
||||
public Allele getAlleleIfInformative() {
|
||||
return getAlleleIfInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the most likely allele if isInformative(log10ThresholdForInformative) is true, or NO_CALL otherwise
|
||||
*
|
||||
* @param log10ThresholdForInformative a log10 threshold to determine if the most likely allele was informative
|
||||
* @return a non-null allele
|
||||
*/
|
||||
public Allele getAlleleIfInformative(final double log10ThresholdForInformative) {
|
||||
return isInformative(log10ThresholdForInformative) ? getMostLikelyAllele() : Allele.NO_CALL;
|
||||
}
|
||||
}
|
||||
|
|
@ -41,10 +41,6 @@ import java.util.*;
|
|||
* For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
|
||||
*/
|
||||
public class PerReadAlleleLikelihoodMap {
|
||||
|
||||
|
||||
public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2;
|
||||
|
||||
protected List<Allele> alleles;
|
||||
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
|
||||
|
||||
|
|
@ -119,9 +115,9 @@ public class PerReadAlleleLikelihoodMap {
|
|||
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
|
||||
// do not remove reduced reads!
|
||||
if ( !entry.getKey().isReducedRead() ) {
|
||||
final Allele bestAllele = getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele != Allele.NO_CALL )
|
||||
alleleReadMap.get(bestAllele).add(entry.getKey());
|
||||
final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele.isInformative() )
|
||||
alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -194,32 +190,25 @@ public class PerReadAlleleLikelihoodMap {
|
|||
|
||||
/**
|
||||
* Given a map from alleles to likelihoods, find the allele with the largest likelihood.
|
||||
* If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD
|
||||
* then the most likely allele is set to "no call"
|
||||
*
|
||||
* @param alleleMap - a map from alleles to likelihoods
|
||||
* @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD
|
||||
* of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the
|
||||
* corresponding key
|
||||
* @return - a MostLikelyAllele object
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public static Allele getMostLikelyAllele( final Map<Allele,Double> alleleMap ) {
|
||||
public static MostLikelyAllele getMostLikelyAllele( final Map<Allele,Double> alleleMap ) {
|
||||
return getMostLikelyAllele(alleleMap, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a map from alleles to likelihoods, find the allele with the largest likelihood.
|
||||
* If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD
|
||||
* then the most likely allele is set to "no call"
|
||||
*
|
||||
* @param alleleMap - a map from alleles to likelihoods
|
||||
* @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best.
|
||||
* this is useful for the case where you've selected a subset of the alleles that
|
||||
* the reads have been computed for further analysis. If null totally ignored
|
||||
* @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD
|
||||
* of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the
|
||||
* corresponding key
|
||||
* @return - a MostLikelyAllele object
|
||||
*/
|
||||
public static Allele getMostLikelyAllele( final Map<Allele,Double> alleleMap, final Set<Allele> onlyConsiderTheseAlleles ) {
|
||||
public static MostLikelyAllele getMostLikelyAllele( final Map<Allele,Double> alleleMap, final Set<Allele> onlyConsiderTheseAlleles ) {
|
||||
if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null");
|
||||
double maxLike = Double.NEGATIVE_INFINITY;
|
||||
double prevMaxLike = Double.NEGATIVE_INFINITY;
|
||||
|
|
@ -237,7 +226,8 @@ public class PerReadAlleleLikelihoodMap {
|
|||
prevMaxLike = el.getValue();
|
||||
}
|
||||
}
|
||||
return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL );
|
||||
|
||||
return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,27 +1,27 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
|
||||
|
||||
|
|
@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
|
@ -71,9 +72,8 @@ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter {
|
|||
// next, output the interesting reads for each sample aligned against the appropriate haplotype
|
||||
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
|
||||
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele != Allele.NO_CALL )
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart());
|
||||
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,33 +1,34 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
|
||||
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
|
@ -77,9 +78,8 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
|
|||
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
|
||||
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
if ( entry.getKey().getMappingQuality() > 0 ) {
|
||||
final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
|
||||
if ( bestAllele != Allele.NO_CALL )
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart());
|
||||
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue