diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 5d8113212..40a6a79e0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -55,6 +55,7 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; @@ -161,8 +162,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); if ( shouldErrorCorrectKmers() ) { - graph = errorCorrect(graph); - if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); + throw new UserException("Error correction no longer supported because of the " + + "incredibly naive way this was implemented. The command line argument remains because some" + + " future subsystem will actually go and error correct the reads"); } final SeqGraph seqGraph = toSeqGraph(graph); @@ -214,6 +216,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; seqGraph.removePathsNotConnectedToRef(); + seqGraph.simplifyGraph(); + if ( seqGraph.vertexSet().size() == 1 ) { + // we've prefectly assembled into a single reference haplotype, add a empty seq vertex to stop + // the code from blowing up. + // TODO -- ref properties should really be on the vertices, not the graph itself + final SeqVertex complete = seqGraph.vertexSet().iterator().next(); + final SeqVertex dummy = new SeqVertex(""); + seqGraph.addVertex(dummy); + seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); + } if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); return seqGraph; @@ -332,39 +344,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return true; } - /** - * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers - * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason - * we cannot actually do the error correction - */ - public DeBruijnGraph errorCorrect(final DeBruijnGraph graph) { - final KMerErrorCorrector corrector = new KMerErrorCorrector(graph.getKmerSize(), 1, 1, 5); // TODO -- should be static variables - - for( final BaseEdge e : graph.edgeSet() ) { - for ( final byte[] kmer : Arrays.asList(graph.getEdgeSource(e).getSequence(), graph.getEdgeTarget(e).getSequence())) { - // TODO -- need a cleaner way to deal with the ref weight - corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); - } - } - - if ( corrector.computeErrorCorrectionMap() ) { - final DeBruijnGraph correctedGraph = new DeBruijnGraph(graph.getKmerSize()); - - for( final BaseEdge e : graph.edgeSet() ) { - final byte[] source = corrector.getErrorCorrectedKmer(graph.getEdgeSource(e).getSequence()); - final byte[] target = corrector.getErrorCorrectedKmer(graph.getEdgeTarget(e).getSequence()); - if ( source != null && target != null ) { - correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); - } - } - - return correctedGraph; - } else { - // the error correction wasn't possible, simply return this graph - return graph; - } - } - protected void printGraphs(final List graphs) { final int writeFirstGraphWithSizeSmallerThan = 50; @@ -461,6 +440,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } + // add genome locs to the haplotypes + for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow); + if ( returnHaplotypes.size() < returnHaplotypes.size() ) logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 7cdc57464..abd502c2b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -153,7 +153,7 @@ public class GenotypingEngine { if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); - if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); + if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 53fffec61..bce179ee1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -77,6 +77,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.EventMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotype.HaplotypeBaseComparator; import org.broadinstitute.sting.utils.haplotype.LDMerger; @@ -300,6 +301,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected int debugGraphTransformations = -1; + // TODO -- not currently useful @Hidden @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useLowQualityBasesForAssembly = false; @@ -308,6 +310,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="useNewLDMerger", shortName="useNewLDMerger", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useNewLDMerger = false; + @Hidden + @Argument(fullName="trimActiveRegions", shortName="trimActiveRegions", doc="If specified, we will trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) + protected boolean trimActiveRegions = false; + // the UG engines private UnifiedGenotyperEngine UG_engine = null; private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; @@ -329,6 +335,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // reference base padding size private static final int REFERENCE_PADDING = 500; + // include at least this many bases around an event for calling it + private final static int PADDING_AROUND_SNPS_FOR_CALLING = 20; + private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150; + + // the maximum extent into the full active region extension that we're willing to go in genotyping our events + private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument @@ -490,7 +503,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final byte qual = p.getQual(); if( p.isDeletion() || qual > (byte) 18) { int AA = 0; final int AB = 1; int BB = 2; - if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { + if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { AA = 2; BB = 0; if( p.isNextToSoftClip() ) { @@ -521,58 +534,53 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- @Override - public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + public Integer map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { if ( justDetermineActiveRegions ) // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; - final List activeAllelesToGenotype = new ArrayList(); + if( !originalActiveRegion.isActive() ) { return 0; } // Not active so nothing to do! + final List activeAllelesToGenotype = new ArrayList(); if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : allelesToGenotype ) { - if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { + if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode } } allelesToGenotype.removeAll( activeAllelesToGenotype ); + // No alleles found in this region so nothing to do! + if ( activeAllelesToGenotype.isEmpty() ) { return 0; } + } else { + if( originalActiveRegion.size() == 0 ) { return 0; } // No reads here so nothing to do! } - if( !activeRegion.isActive() ) { return 0; } // Not active so nothing to do! - if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! + // run the local assembler, getting back a collection of information on how we should proceed + final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); - finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails - - final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region - final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); - final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); - if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! - - final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria - if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! - - // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM - Collections.sort( haplotypes, new HaplotypeBaseComparator() ); - - if (dontGenotype) - return 1; + // abort early if something is out of the acceptable range + if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! + if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + if (dontGenotype) return 1; // user requested we not proceed // evaluate each sample's reads against all haplotypes - final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); + //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); + final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); + + // filter out reads from genotyping which fail mapping quality based criteria + final List filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final List bestHaplotypes = selectBestHaplotypesForGenotyping(haplotypes, stratifiedReadMap); + final List bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap); final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, bestHaplotypes, stratifiedReadMap, perSampleFilteredReadList, - fullReferenceWithPadding, - paddedReferenceLoc, - activeRegion.getLocation(), + assemblyResult.fullReferenceWithPadding, + assemblyResult.paddedReferenceLoc, + assemblyResult.regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ); @@ -583,7 +591,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } if ( bamWriter != null ) { - haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); + haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc, + bestHaplotypes, + calledHaplotypes.getCalledHaplotypes(), + stratifiedReadMap); } if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } @@ -591,6 +602,152 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return 1; // One active region was processed during this map call } + private final static class AssemblyResult { + final List haplotypes; + final ActiveRegion regionForGenotyping; + final byte[] fullReferenceWithPadding; + final GenomeLoc paddedReferenceLoc; + + private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc) { + this.haplotypes = haplotypes; + this.regionForGenotyping = regionForGenotyping; + this.fullReferenceWithPadding = fullReferenceWithPadding; + this.paddedReferenceLoc = paddedReferenceLoc; + } + } + + /** + * High-level function that runs the assembler on the active region reads, + * returning a data structure with the resulting information needed + * for further HC steps + * + * @param activeRegion the region we should assemble + * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) + * @return the AssemblyResult describing how to proceed with genotyping + */ + protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { + // Create the reference haplotype which is the bases from the reference that make up the active region + finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails + + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); + final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); + final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); + + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); + + if ( trimActiveRegions ) { + return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); + } else { + // we don't want to or cannot create a trimmed active region, so go ahead and use the old one + return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc); + } + } + + /** + * Trim down the active region to just enough to properly genotype the events among the haplotypes + * + * This function merely creates the region, but it doesn't populate the reads back into the region + * + * @param region our full active region + * @param haplotypes the list of haplotypes we've created from assembly + * @param ref the reference bases over the full padded location + * @param refLoc the span of the reference bases + * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully + */ + private ActiveRegion createTrimmedRegion(final ActiveRegion region, final List haplotypes, final byte[] ref, final GenomeLoc refLoc) { + EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); + final TreeSet allContexts = EventMap.getAllVariantContexts(haplotypes); + final GenomeLocParser parser = getToolkit().getGenomeLocParser(); + + if ( allContexts.isEmpty() ) // no variants, so just return the current region + return null; + + final List withinActiveRegion = new LinkedList(); + int pad = PADDING_AROUND_SNPS_FOR_CALLING; + GenomeLoc trimLoc = null; + for ( final VariantContext vc : allContexts ) { + final GenomeLoc vcLoc = parser.createGenomeLoc(vc); + if ( region.getLocation().overlapsP(vcLoc) ) { + if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding + pad = PADDING_AROUND_OTHERS_FOR_CALLING; + trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); + withinActiveRegion.add(vc); + } + } + + // we don't actually have anything in the region after removing variants that don't overlap the region's full location + if ( trimLoc == null ) return null; + + final GenomeLoc maxSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(region.getLocation(), MAX_GENOTYPING_ACTIVE_REGION_EXTENSION); + final GenomeLoc idealSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(trimLoc, pad); + final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); + + final ActiveRegion trimmedRegion = region.trim(finalSpan); + if ( DEBUG ) { + logger.info("events : " + withinActiveRegion); + logger.info("trimLoc : " + trimLoc); + logger.info("pad : " + pad); + logger.info("idealSpan : " + idealSpan); + logger.info("maxSpan : " + maxSpan); + logger.info("finalSpan : " + finalSpan); + logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size()); + } + return trimmedRegion; + } + + /** + * Trim down the active region to just enough to properly genotype the events among the haplotypes + * + * @param originalActiveRegion our full active region + * @param haplotypes the list of haplotypes we've created from assembly + * @param fullReferenceWithPadding the reference bases over the full padded location + * @param paddedReferenceLoc the span of the reference bases + * @return an AssemblyResult containing the trimmed active region with all of the reads we should use + * trimmed down as well, and a revised set of haplotypes. If trimming failed this function + * may choose to use the originalActiveRegion without modification + */ + private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion, + final List haplotypes, + final byte[] fullReferenceWithPadding, + final GenomeLoc paddedReferenceLoc) { + final ActiveRegion trimmedActiveRegion = createTrimmedRegion(originalActiveRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); + + if ( trimmedActiveRegion == null ) + return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + + // trim down the haplotypes + final Set haplotypeSet = new HashSet(haplotypes.size()); + for ( final Haplotype h : haplotypes ) { + final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); + if ( trimmed != null ) { + haplotypeSet.add(trimmed); + } else if ( DEBUG ) { + logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + " because it starts with or ends with an insertion or deletion when trimmed to " + trimmedActiveRegion.getExtendedLoc()); + } + } + + // create the final list of trimmed haplotypes + final List trimmedHaplotypes = new ArrayList(haplotypeSet); + + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); + + if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + + // trim down the reads and add them to the trimmed active region + final List trimmedReads = new ArrayList(originalActiveRegion.getReads().size()); + for( final GATKSAMRecord read : originalActiveRegion.getReads() ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() ); + if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + trimmedReads.add(clippedRead); + } + } + trimmedActiveRegion.clearReads(); + trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); + + return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + } + /** * Select the best N haplotypes according to their likelihoods, if appropriate * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java similarity index 50% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java index b051e5411..1f0903753 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java @@ -51,110 +51,31 @@ import org.apache.log4j.Logger; import java.util.*; /** - * generic utility function that error corrects kmers based on counts - * - * This class provides a generic facility for remapping kmers (byte[] of constant size) - * that occur infrequently to those that occur frequently, based on their simple edit distance - * as measured by mismatches. - * - * The overall workflow of using this class is simple. First, you create the class with - * parameters determining how the error correction should proceed. Next, you provide all - * of the kmers you see in your data. Once all kmers have been added, you call computeErrorCorrectionMap - * to tell this class that all kmers have been added and its time to determine error correcting - * mapping from observed kmers to corrected kmers. This correction looks for low-count (as determined - * by maxCountToCorrect) kmers and chooses the best kmer (minimizing mismatches) among those - * with at least minCountOfKmerToBeCorrection occurrences to error correct the kmer to. If - * there is no kmer with less than maxMismatchesToCorrect then the kmer will be mapped to - * null, indicating the kmer should not be used. - * - * TODO -- for ease of implementation this class uses strings instead of byte[] as those cannot - * TODO -- be added to hashmaps (more specifically, those don't implement .equals). A more efficient - * TODO -- version would use the byte[] directly - * - * TODO -- this is just not the right way to implement error correction in the graph. Basically, the - * right way to think about this is error correcting reads: - * - * * - * ACTGAT - * ACT - * CTG - * TGA - * GAT - * - * Now suppose the G is an error. What you are doing is asking for each 3mer in the read whether it's high quality - * or not. Suppose the answer is - * - * * - * ACTGAT - * ACT -- yes - * CTG -- no [CTG is unusual] - * TGA -- no [TGA is unusual] - * GAT -- yes [maybe GAT is just common, even through its an error] - * - * As we do this process it's clear how we can figure out which positions in the read likely harbor errors, and - * then go search around those bases in the read in an attempt to fix the read. We don't have to compute for - * every bad kmer it's best match, as that's just not the problem we are thinking looking to solve. We are actually - * looking for a change to a read such that all spanning kmers are well-supported. This class is being disabled - * until we figure implement this change. + * generic utility class that counts kmers * + * Basically you add kmers to the counter, and it tells you how many occurrences of each kmer it's seen. * * User: depristo * Date: 3/8/13 * Time: 1:16 PM */ -public class KMerErrorCorrector { - private final static Logger logger = Logger.getLogger(KMerErrorCorrector.class); - - /** - * The maximum number of bad kmer -> good kmer correction operations we'll consider doing before - * aborting for efficiency reasons. Basically, the current algorithm sucks, and is O(n^2), and - * so we cannot simply error correct 10K bad kmers against a db of 100K kmers if we ever want - * to finish running in a reasonable amount of time. This isn't worth fixing because fundamentally - * the entire error correction algorithm is just not right (i.e., it's correct but not ideal conceptually - * so we'll just fix the conceptual problem than the performance issue). - */ - private final static int MAX_CORRECTION_OPS_TO_ALLOW = 5000 * 1000; +public class KMerCounter { + private final static Logger logger = Logger.getLogger(KMerCounter.class); /** * A map of for each kmer to its num occurrences in addKmers */ - Map countsByKMer = new HashMap(); + private final Map countsByKMer = new HashMap(); + private final int kmerLength; /** - * A map from raw kmer -> error corrected kmer - */ - Map rawToErrorCorrectedMap = null; - - final int kmerLength; - final int maxCountToCorrect; - final int maxMismatchesToCorrect; - final int minCountOfKmerToBeCorrection; - - /** - * Create a new kmer corrector + * Create a new kmer counter * * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 - * @param maxCountToCorrect kmers with < maxCountToCorrect will try to be error corrected to another kmer, must be >= 0 - * @param maxMismatchesToCorrect the maximum number of mismatches between a to-be-corrected kmer and its - * best match that we attempt to error correct. If no sufficiently similar - * kmer exists, it will be remapped to null. Must be >= 1 - * @param minCountOfKmerToBeCorrection the minimum count of a kmer to be considered a target for correction. - * That is, kmers that need correction will only be matched with kmers - * with at least minCountOfKmerToBeCorrection occurrences. Must be >= 1 */ - public KMerErrorCorrector(final int kmerLength, - final int maxCountToCorrect, - final int maxMismatchesToCorrect, - final int minCountOfKmerToBeCorrection) { + public KMerCounter(final int kmerLength) { if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); - if ( maxCountToCorrect < 0 ) throw new IllegalArgumentException("maxCountToCorrect must be >= 0 but got " + maxCountToCorrect); - if ( maxMismatchesToCorrect < 1 ) throw new IllegalArgumentException("maxMismatchesToCorrect must be >= 1 but got " + maxMismatchesToCorrect); - if ( minCountOfKmerToBeCorrection < 1 ) throw new IllegalArgumentException("minCountOfKmerToBeCorrection must be >= 1 but got " + minCountOfKmerToBeCorrection); - this.kmerLength = kmerLength; - this.maxCountToCorrect = maxCountToCorrect; - this.maxMismatchesToCorrect = maxMismatchesToCorrect; - this.minCountOfKmerToBeCorrection = minCountOfKmerToBeCorrection; } /** @@ -165,7 +86,17 @@ public class KMerErrorCorrector { protected void addKmers(final String ... kmers) { for ( final String kmer : kmers ) addKmer(kmer, 1); - computeErrorCorrectionMap(); + } + + /** + * Get the count of kmer in this kmer counter + * @param kmer a non-null counter to get + * @return a positive integer + */ + public int getKmerCount(final byte[] kmer) { + if ( kmer == null ) throw new IllegalArgumentException("kmer cannot be null"); + final CountedKmer counted = countsByKMer.get(new String(kmer)); + return counted == null ? 0 : counted.count; } /** @@ -178,68 +109,9 @@ public class KMerErrorCorrector { addKmer(new String(rawKmer), kmerCount); } - - /** - * Get the error corrected kmer for rawKmer - * - * @param rawKmer a kmer that was already added that we want to get an error corrected version for - * @return an error corrected kmer to use instead of rawKmer. May be == rawKmer if no error correction - * is not necessary. May be null, indicating the rawKmer shouldn't be used at all - */ - public byte[] getErrorCorrectedKmer(final byte[] rawKmer) { - final String result = getErrorCorrectedKmer(new String(rawKmer)); - return result == null ? null : result.getBytes(); - } - - /** - * Indicate that no more kmers will be added to the kmer error corrector, so that the - * error correction data structure should be computed from the added kmers. Enabled calls - * to getErrorCorrectedKmer, and disable calls to addKmer. - * - * @return true if the error correction map could actually be computed, false if for any reason - * (efficiency, memory, we're out to lunch) a correction map couldn't be created. - */ - public boolean computeErrorCorrectionMap() { - if ( countsByKMer == null ) - throw new IllegalStateException("computeErrorCorrectionMap can only be called once"); - - final LinkedList needsCorrection = new LinkedList(); - final List goodKmers = new ArrayList(countsByKMer.size()); - - rawToErrorCorrectedMap = new HashMap(countsByKMer.size()); - for ( final CountedKmer countedKmer: countsByKMer.values() ) { - if ( countedKmer.count <= maxCountToCorrect ) - needsCorrection.add(countedKmer); - else { - // todo -- optimization could make not in map mean == - rawToErrorCorrectedMap.put(countedKmer.kmer, countedKmer.kmer); - - // only allow corrections to kmers with at least this count - if ( countedKmer.count >= minCountOfKmerToBeCorrection ) - goodKmers.add(countedKmer); - } - } - - // cleanup memory -- we don't need the counts for each kmer any longer - countsByKMer = null; - - if ( goodKmers.size() * needsCorrection.size() > MAX_CORRECTION_OPS_TO_ALLOW ) - return false; - else { - Collections.sort(goodKmers); - for ( final CountedKmer toCorrect : needsCorrection ) { - final String corrected = findClosestKMer(toCorrect, goodKmers); - rawToErrorCorrectedMap.put(toCorrect.kmer, corrected); - } - - return true; - } - } - protected void addKmer(final String rawKmer, final int kmerCount) { if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount); - if ( countsByKMer == null ) throw new IllegalStateException("Cannot add kmers to an already finalized error corrector"); CountedKmer countFromMap = countsByKMer.get(rawKmer); if ( countFromMap == null ) { @@ -249,55 +121,10 @@ public class KMerErrorCorrector { countFromMap.count += kmerCount; } - protected String findClosestKMer(final CountedKmer kmer, final Collection goodKmers) { - String bestMatch = null; - int minMismatches = Integer.MAX_VALUE; - - for ( final CountedKmer goodKmer : goodKmers ) { - final int mismatches = countMismatches(kmer.kmer, goodKmer.kmer, minMismatches); - if ( mismatches < minMismatches ) { - minMismatches = mismatches; - bestMatch = goodKmer.kmer; - } - - // if we find an edit-distance 1 result, abort early, as we know there can be no edit distance 0 results - if ( mismatches == 1 ) - break; - } - - return minMismatches > maxMismatchesToCorrect ? null : bestMatch; - } - - protected int countMismatches(final String one, final String two, final int currentBest) { - int mismatches = 0; - for ( int i = 0; i < one.length(); i++ ) { - mismatches += one.charAt(i) == two.charAt(i) ? 0 : 1; - if ( mismatches > currentBest ) - break; - if ( mismatches > maxMismatchesToCorrect ) - return Integer.MAX_VALUE; - } - return mismatches; - } - - protected String getErrorCorrectedKmer(final String rawKmer) { - if ( rawToErrorCorrectedMap == null ) throw new IllegalStateException("Cannot get error corrected kmers until after computeErrorCorrectionMap has been called"); - if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); - return rawToErrorCorrectedMap.get(rawKmer); - } - @Override public String toString() { - final StringBuilder b = new StringBuilder("KMerErrorCorrector{"); - if ( rawToErrorCorrectedMap == null ) { - b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); - } else { - for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { - final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); - if ( correcting ) - b.append(String.format("%n\tCorrecting %s -> %s", toCorrect.getKey(), toCorrect.getValue())); - } - } + final StringBuilder b = new StringBuilder("KMerCounter{"); + b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); b.append("\n}"); return b.toString(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index 66085fcad..c11841dac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerErrorCorrector; import java.util.Arrays; import java.util.HashMap; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java new file mode 100644 index 000000000..56197047b --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java @@ -0,0 +1,84 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class KMerCounterUnitTest extends BaseTest { + @Test + public void testMyData() { + final KMerCounter counter = new KMerCounter(3); + + Assert.assertNotNull(counter.toString()); + + counter.addKmers( + "ATG", "ATG", "ATG", "ATG", + "ACC", "ACC", "ACC", + "AAA", "AAA", + "CTG", + "NNA", + "CCC" + ); + + testCounting(counter, "ATG", 4); + testCounting(counter, "ACC", 3); + testCounting(counter, "AAA", 2); + testCounting(counter, "CTG", 1); + testCounting(counter, "NNA", 1); + testCounting(counter, "CCC", 1); + testCounting(counter, "NNN", 0); + testCounting(counter, "NNC", 0); + + Assert.assertNotNull(counter.toString()); + } + + private void testCounting(final KMerCounter counter, final String in, final int expectedCount) { + Assert.assertEquals(counter.getKmerCount(in.getBytes()), expectedCount); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java deleted file mode 100644 index f8a540b70..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; - -public class KMerErrorCorrectorUnitTest extends BaseTest { - @Test - public void testMyData() { - final KMerErrorCorrector corrector = new KMerErrorCorrector(3, 1, 2, 2); - - Assert.assertNotNull(corrector.toString()); - - corrector.addKmers( - "ATG", "ATG", "ATG", "ATG", - "ACC", "ACC", "ACC", - "AAA", "AAA", - "CTG", // -> ATG - "NNA", // -> AAA - "CCC", // => ACC - "NNN", // => null - "NNC" // => ACC [because of min count won't go to NNA] - ); - - testCorrection(corrector, "ATG", "ATG"); - testCorrection(corrector, "ACC", "ACC"); - testCorrection(corrector, "AAA", "AAA"); - testCorrection(corrector, "CTG", "ATG"); - testCorrection(corrector, "NNA", "AAA"); - testCorrection(corrector, "CCC", "ACC"); - testCorrection(corrector, "NNN", null); - testCorrection(corrector, "NNC", "ACC"); - - Assert.assertNotNull(corrector.toString()); - } - - private void testCorrection(final KMerErrorCorrector corrector, final String in, final String out) { - Assert.assertEquals(corrector.getErrorCorrectedKmer(in), out); - Assert.assertEquals(corrector.getErrorCorrectedKmer(in.getBytes()), out == null ? null : out.getBytes()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index b38d6575e..2f4c1b55d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -149,7 +149,7 @@ public class ActiveRegion implements HasGenomeLocation { @Override public String toString() { - return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size() + " "; + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); } /** @@ -374,6 +374,8 @@ public class ActiveRegion implements HasGenomeLocation { * * Note that the returned list may be empty, if this active region doesn't overlap the set at all * + * Note that the resulting regions are all empty, regardless of whether the current active region has reads + * * @param intervals a non-null set of intervals that are allowed * @return an ordered list of active region where each interval is contained within intervals */ @@ -383,14 +385,59 @@ public class ActiveRegion implements HasGenomeLocation { final List clippedRegions = new LinkedList(); for ( final GenomeLoc overlapping : allOverlapping ) { - final GenomeLoc subLoc = getLocation().intersect(overlapping); - final int subStart = subLoc.getStart() - getLocation().getStart(); - final int subEnd = subStart + subLoc.size(); - final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); - final ActiveRegion clipped = new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension ); - clippedRegions.add(clipped); + clippedRegions.add(trim(overlapping, extension)); } return clippedRegions; } + + /** + * Trim this active to just the newExtent, producing a new active region without any reads that has only + * the extent of newExtend intersected with the current extent + * @param newExtent the new extend of the active region we want + * @param newExtension the extension size we want for the newly trimmed active region + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc newExtent, final int newExtension) { + if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + + final GenomeLoc subLoc = getLocation().intersect(newExtent); + final int subStart = subLoc.getStart() - getLocation().getStart(); + final int subEnd = subStart + subLoc.size(); + final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); + return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, newExtension ); + } + + /** + * Trim this active to no more than the newExtent, producing a new active region without any reads that + * attempts to provide the best possible representation of this active region covering the newExtent. + * + * The challenge here is that newExtent may (1) be larger than can be represented by this active region + * + its original extension and (2) the extension must be symmetric on both sides. This algorithm + * therefore determines how best to represent newExtent as a subset of the span of this + * region with a padding value that captures as much of the newExtent as possible. + * + * For example, suppose this active region is + * + * Active: 100-200 with extension of 50, so that the true span is 50-250 + * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225 + * + * Here we represent the active region as a active region from 150-200 with 25 bp of padding. + * + * The overall constraint is that the active region can never exceed the original active region, and + * the extension is chosen to maximize overlap with the desired region + * + * @param newExtent the new extend of the active region we want + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc newExtent) { + if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + + final GenomeLoc subActive = getLocation().intersect(newExtent); + final int requiredOnRight = Math.max(newExtent.getStop() - subActive.getStop(), 0); + final int requiredOnLeft = Math.max(subActive.getStart() - newExtent.getStart(), 0); + final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); + + return new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java index 1d33e328d..ab5f23894 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java @@ -390,4 +390,27 @@ public class EventMap extends TreeMap { return startPosKeySet; } + + private static class VariantContextComparator implements Comparator { + @Override + public int compare(VariantContext vc1, VariantContext vc2) { + return vc1.getStart() - vc2.getStart(); + } + } + + /** + * Get all of the VariantContexts in the event maps for all haplotypes, sorted by their start position + * @param haplotypes the set of haplotypes to grab the VCs from + * @return a sorted set of variant contexts + */ + public static TreeSet getAllVariantContexts( final List haplotypes ) { + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet vcs = new TreeSet(new VariantContextComparator()); + + for( final Haplotype h : haplotypes ) { + vcs.addAll(h.getEventMap().getVariantContexts()); + } + + return vcs; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index 081fd14e0..bacee7942 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -103,6 +103,40 @@ public class Haplotype extends Allele { this.genomeLocation = loc; } + /** + * Create a new Haplotype derived from this one that exactly spans the provided location + * + * Note that this haplotype must have a contain a genome loc for this operation to be successful. If no + * GenomeLoc is contained than @throws an IllegalStateException + * + * Also loc must be fully contained within this Haplotype's genomeLoc. If not an IllegalArgumentException is + * thrown. + * + * @param loc a location completely contained within this Haplotype's location + * @return a new Haplotype within only the bases spanning the provided location, or null for some reason the haplotype would be malformed if + */ + public Haplotype trim(final GenomeLoc loc) { + if ( loc == null ) throw new IllegalArgumentException("Loc cannot be null"); + if ( genomeLocation == null ) throw new IllegalStateException("Cannot trim a Haplotype without containing GenomeLoc"); + if ( ! genomeLocation.containsP(loc) ) throw new IllegalArgumentException("Can only trim a Haplotype to a containing span. My loc is " + genomeLocation + " but wanted trim to " + loc); + if ( getCigar() == null ) throw new IllegalArgumentException("Cannot trim haplotype without a cigar " + this); + + final int newStart = loc.getStart() - this.genomeLocation.getStart(); + final int newStop = newStart + loc.size() - 1; + final byte[] newBases = AlignmentUtils.getBasesCoveringRefInterval(newStart, newStop, getBases(), 0, getCigar()); + final Cigar newCigar = AlignmentUtils.trimCigarByReference(getCigar(), newStart, newStop); + + if ( newBases == null || AlignmentUtils.startsOrEndsWithInsertionOrDeletion(newCigar) ) + // we cannot meaningfully chop down the haplotype, so return null + return null; + + final Haplotype ret = new Haplotype(newBases, isReference()); + ret.setCigar(newCigar); + ret.setGenomeLocation(loc); + ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef()); + return ret; + } + @Override public boolean equals( Object h ) { return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); @@ -126,6 +160,18 @@ public class Haplotype extends Allele { return getDisplayString(); } + /** + * Get the span of this haplotype (may be null) + * @return a potentially null genome loc + */ + public GenomeLoc getGenomeLocation() { + return genomeLocation; + } + + public void setGenomeLocation(GenomeLoc genomeLocation) { + this.genomeLocation = genomeLocation; + } + public long getStartPosition() { return genomeLocation.getStart(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 9b25b00c6..2208302fb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -48,6 +48,24 @@ public final class AlignmentUtils { // cannot be instantiated private AlignmentUtils() { } + /** + * Does cigar start or end with a deletion operation? + * + * @param cigar a non-null cigar to test + * @return true if the first or last operator of cigar is a D + */ + public static boolean startsOrEndsWithInsertionOrDeletion(final Cigar cigar) { + if ( cigar == null ) throw new IllegalArgumentException("Cigar cannot be null"); + + if ( cigar.isEmpty() ) + return false; + + final CigarOperator first = cigar.getCigarElement(0).getOperator(); + final CigarOperator last = cigar.getCigarElement(cigar.numCigarElements()-1).getOperator(); + return first == CigarOperator.D || first == CigarOperator.I || last == CigarOperator.D || last == CigarOperator.I; + } + + /** * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference @@ -55,6 +73,8 @@ public final class AlignmentUtils { * refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the reference * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X. * + * If refStart or refEnd would start or end the new bases within a deletion, this function will return null + * * @param bases * @param refStart * @param refEnd @@ -63,7 +83,7 @@ public final class AlignmentUtils { * 10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10. * It's not trivial to eliminate this parameter because it's tied up with the cigar * @param basesToRefCigar the cigar that maps the bases to the reference genome - * @return a non-null byte[] + * @return a byte[] containing the bases covering this interval, or null if we would start or end within a deletion */ public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) { if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd); @@ -74,33 +94,41 @@ public final class AlignmentUtils { int refPos = basesStartOnRef; int basesPos = 0; - int basesStart = -1; int basesStop = -1; boolean done = false; for ( int iii = 0; ! done && iii < basesToRefCigar.numCigarElements(); iii++ ) { final CigarElement ce = basesToRefCigar.getCigarElement(iii); - final int bInc, rInc; switch ( ce.getOperator() ) { - case I: bInc = 1; rInc = 0; break; - case M: case X: case EQ: bInc = rInc = 1; break; - case D: bInc = 0; rInc = 1; break; + case I: + basesPos += ce.getLength(); + break; + case M: case X: case EQ: + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refStart ) + basesStart = basesPos; + if ( refPos == refEnd ) { + basesStop = basesPos; + done = true; + break; + } + refPos++; + basesPos++; + } + break; + case D: + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refEnd || refPos == refStart ) { + // if we ever reach a ref position that is either a start or an end, we fail + return null; + } + refPos++; + } + break; default: throw new IllegalStateException("Unsupported operator " + ce); } - - for ( int i = 0; i < ce.getLength(); i++ ) { - if ( refPos == refStart ) - basesStart = basesPos; - if ( refPos == refEnd ) { - basesStop = basesPos; - done = true; - break; - } - refPos += rInc; - basesPos += bInc; - } } if ( basesStart == -1 || basesStop == -1 ) diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java index 7f0f93704..ad5fd3642 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -49,7 +49,7 @@ import java.util.*; public class ActiveRegionUnitTest extends BaseTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; private IndexedFastaSequenceFile seq; private String contig; @@ -309,4 +309,75 @@ public class ActiveRegionUnitTest extends BaseTest { } } } + + // ----------------------------------------------------------------------------------------------- + // + // Make sure we can properly cut up an active region based on engine intervals + // + // ----------------------------------------------------------------------------------------------- + + @DataProvider(name = "TrimActiveRegionData") + public Object[][] makeTrimActiveRegionData() { + List tests = new ArrayList(); + + // fully enclosed within active region + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 16), + genomeLocParser.createGenomeLoc("20", 15, 16), 0}); + + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 10, 15), + genomeLocParser.createGenomeLoc("20", 10, 15), 0}); + + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 20), + genomeLocParser.createGenomeLoc("20", 15, 20), 0}); + + // needs extra padding on the right + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 25), + genomeLocParser.createGenomeLoc("20", 15, 20), 5}); + + // needs extra padding on the left + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 5, 15), + genomeLocParser.createGenomeLoc("20", 10, 15), 5}); + + // needs extra padding on both + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 7, 21), + genomeLocParser.createGenomeLoc("20", 10, 20), 3}); + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 9, 23), + genomeLocParser.createGenomeLoc("20", 10, 20), 3}); + + // desired span captures everything, so we're returning everything. Tests that extension is set correctly + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 1, 50), + genomeLocParser.createGenomeLoc("20", 10, 20), 10}); + + // At the start of the chromosome, potentially a bit weird + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 1, 10), 10, + genomeLocParser.createGenomeLoc("20", 1, 50), + genomeLocParser.createGenomeLoc("20", 1, 10), 10}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimActiveRegionData") + public void testTrimActiveRegion(final GenomeLoc regionLoc, final int extension, final GenomeLoc desiredSpan, final GenomeLoc expectedActiveRegion, final int expectedExtension) { + final ActiveRegion region = new ActiveRegion(regionLoc, Collections.emptyList(), true, genomeLocParser, extension); + final ActiveRegion trimmed = region.trim(desiredSpan); + Assert.assertEquals(trimmed.getLocation(), expectedActiveRegion, "Incorrect region"); + Assert.assertEquals(trimmed.getExtension(), expectedExtension, "Incorrect region"); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java index fe02aea9f..cfbc4a3e0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java @@ -31,12 +31,15 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -45,10 +48,6 @@ import java.util.*; * Basic unit test for Haplotype Class */ public class HaplotypeUnitTest extends BaseTest { - @BeforeClass - public void init() { - } - @Test public void testSimpleInsertionAllele() { final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; @@ -183,4 +182,68 @@ public class HaplotypeUnitTest extends BaseTest { Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(1).toString(), "1M3I1M"); Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(2).toString(), "1M3I2M"); } + + @DataProvider(name = "TrimmingData") + public Object[][] makeTrimmingData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, 10, 20); + final String fullBases = "ACGTAACCGGT"; + for ( int trimStart = loc.getStart(); trimStart < loc.getStop(); trimStart++ ) { + for ( int trimStop = trimStart; trimStop <= loc.getStop(); trimStop++ ) { + final int start = trimStart - loc.getStart(); + final int stop = start + (trimStop - trimStart) + 1; + final GenomeLoc trimmedLoc = new UnvalidatingGenomeLoc("20", 0, start + loc.getStart(), stop + loc.getStart() - 1); + final String expectedBases = fullBases.substring(start, stop); + final Haplotype full = new Haplotype(fullBases.getBytes(), loc); + final Haplotype trimmed = new Haplotype(expectedBases.getBytes(), trimmedLoc); + + final int hapStart = 10; + full.setAlignmentStartHapwrtRef(hapStart); + full.setCigar(TextCigarCodec.getSingleton().decode(full.length() + "M")); + + trimmed.setAlignmentStartHapwrtRef(hapStart + start); + trimmed.setCigar(TextCigarCodec.getSingleton().decode(trimmed.length() + "M")); + + tests.add(new Object[]{full, trimmedLoc, trimmed}); + } + } + + final Haplotype full = new Haplotype("ACT".getBytes(), new UnvalidatingGenomeLoc("20", 0, 10, 14)); + full.setAlignmentStartHapwrtRef(10); + full.setCigar(TextCigarCodec.getSingleton().decode("1M2D2M")); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 11, 12), null}); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 10, 12), null}); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 11, 13), null}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimmingData") + public void testTrim(final Haplotype full, final GenomeLoc trimTo, final Haplotype expected) { + final Haplotype actual = full.trim(trimTo); + if ( expected != null ) { + Assert.assertEquals(actual.getBases(), expected.getBases()); + Assert.assertEquals(actual.getStartPosition(), trimTo.getStart()); + Assert.assertEquals(actual.getStopPosition(), trimTo.getStop()); + Assert.assertEquals(actual.getCigar(), expected.getCigar()); + Assert.assertEquals(actual.getAlignmentStartHapwrtRef(), expected.getAlignmentStartHapwrtRef()); + } else { + Assert.assertNull(actual); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testBadTrimLoc() { + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, 10, 20); + final Haplotype hap = new Haplotype("ACGTAACCGGT".getBytes(), loc); + hap.trim(new UnvalidatingGenomeLoc("20", 0, 1, 20)); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testBadTrimNoLoc() { + final Haplotype hap = new Haplotype("ACGTAACCGGT".getBytes()); + hap.trim(new UnvalidatingGenomeLoc("20", 0, 1, 20)); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index 125450257..2a2d80206 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -948,4 +948,89 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(actualEndPos, pos + elt.getLength()); Assert.assertEquals(AlignmentUtils.consolidateCigar(new Cigar(elts)), expectedCigar); } + + @DataProvider(name = "GetBasesCoveringRefIntervalData") + public Object[][] makeGetBasesCoveringRefIntervalData() { + List tests = new ArrayList(); + + // matches + // 0123 + // ACGT + tests.add(new Object[]{"ACGT", 0, 3, "4M", "ACGT"}); + tests.add(new Object[]{"ACGT", 1, 3, "4M", "CGT"}); + tests.add(new Object[]{"ACGT", 1, 2, "4M", "CG"}); + tests.add(new Object[]{"ACGT", 1, 1, "4M", "C"}); + + // deletions + // 012345 + // AC--GT + tests.add(new Object[]{"ACGT", 0, 5, "2M2D2M", "ACGT"}); + tests.add(new Object[]{"ACGT", 1, 5, "2M2D2M", "CGT"}); + tests.add(new Object[]{"ACGT", 2, 5, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 3, 5, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 4, 5, "2M2D2M", "GT"}); + tests.add(new Object[]{"ACGT", 5, 5, "2M2D2M", "T"}); + tests.add(new Object[]{"ACGT", 0, 4, "2M2D2M", "ACG"}); + tests.add(new Object[]{"ACGT", 0, 3, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 0, 2, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 0, 1, "2M2D2M", "AC"}); + tests.add(new Object[]{"ACGT", 0, 0, "2M2D2M", "A"}); + + // insertions + // 01--23 + // ACTTGT + tests.add(new Object[]{"ACTTGT", 0, 3, "2M2I2M", "ACTTGT"}); + tests.add(new Object[]{"ACTTGT", 1, 3, "2M2I2M", "CTTGT"}); + tests.add(new Object[]{"ACTTGT", 2, 3, "2M2I2M", "GT"}); + tests.add(new Object[]{"ACTTGT", 3, 3, "2M2I2M", "T"}); + tests.add(new Object[]{"ACTTGT", 0, 2, "2M2I2M", "ACTTG"}); + tests.add(new Object[]{"ACTTGT", 0, 1, "2M2I2M", "AC"}); + tests.add(new Object[]{"ACTTGT", 1, 2, "2M2I2M", "CTTG"}); + tests.add(new Object[]{"ACTTGT", 2, 2, "2M2I2M", "G"}); + tests.add(new Object[]{"ACTTGT", 1, 1, "2M2I2M", "C"}); + + tests.add(new Object[]{"ACGT", 0, 1, "2M2I", "AC"}); + tests.add(new Object[]{"ACGT", 1, 1, "2M2I", "C"}); + tests.add(new Object[]{"ACGT", 0, 0, "2M2I", "A"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "GetBasesCoveringRefIntervalData", enabled = true) + public void testGetBasesCoveringRefInterval(final String basesString, final int refStart, final int refEnd, final String cigarString, final String expected) { + final byte[] actualBytes = AlignmentUtils.getBasesCoveringRefInterval(refStart, refEnd, basesString.getBytes(), 0, TextCigarCodec.getSingleton().decode(cigarString)); + if ( expected == null ) + Assert.assertNull(actualBytes); + else + Assert.assertEquals(new String(actualBytes), expected); + } + + @DataProvider(name = "StartsOrEndsWithInsertionOrDeletionData") + public Object[][] makeStartsOrEndsWithInsertionOrDeletionData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"2M", false}); + tests.add(new Object[]{"1D2M", true}); + tests.add(new Object[]{"2M1D", true}); + tests.add(new Object[]{"2M1I", true}); + tests.add(new Object[]{"1I2M", true}); + tests.add(new Object[]{"1M1I2M", false}); + tests.add(new Object[]{"1M1D2M", false}); + tests.add(new Object[]{"1M1I2M1I", true}); + tests.add(new Object[]{"1M1I2M1D", true}); + tests.add(new Object[]{"1D1M1I2M", true}); + tests.add(new Object[]{"1I1M1I2M", true}); + tests.add(new Object[]{"1M1I2M1I1M", false}); + tests.add(new Object[]{"1M1I2M1D1M", false}); + tests.add(new Object[]{"1M1D2M1D1M", false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true) + public void testStartsOrEndsWithInsertionOrDeletion(final String cigar, final boolean expected) { + Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected); + } + + }